deep-learning
Compare original and translation side by side
🇺🇸
Original
English🇨🇳
Translation
ChineseDeep Learning
深度学习
Production-grade deep learning with PyTorch, neural network architectures, and modern training practices.
基于PyTorch、神经网络架构与现代训练实践的生产级深度学习方案。
Quick Start
快速开始
python
python
PyTorch Production Training Loop
PyTorch Production Training Loop
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
import wandb
class TransformerClassifier(nn.Module):
def __init__(self, vocab_size: int, d_model: int = 256, n_heads: int = 8, n_classes: int = 2):
super().__init__()
self.embedding = nn.Embedding(vocab_size, d_model)
self.pos_encoding = nn.Parameter(torch.randn(1, 512, d_model))
encoder_layer = nn.TransformerEncoderLayer(d_model, n_heads, dim_feedforward=1024, batch_first=True)
self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=6)
self.classifier = nn.Linear(d_model, n_classes)
self.dropout = nn.Dropout(0.1)
def forward(self, x, mask=None):
x = self.embedding(x) + self.pos_encoding[:, :x.size(1), :]
x = self.dropout(x)
x = self.transformer(x, src_key_padding_mask=mask)
x = x.mean(dim=1) # Global average pooling
return self.classifier(x)
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
import wandb
class TransformerClassifier(nn.Module):
def __init__(self, vocab_size: int, d_model: int = 256, n_heads: int = 8, n_classes: int = 2):
super().__init__()
self.embedding = nn.Embedding(vocab_size, d_model)
self.pos_encoding = nn.Parameter(torch.randn(1, 512, d_model))
encoder_layer = nn.TransformerEncoderLayer(d_model, n_heads, dim_feedforward=1024, batch_first=True)
self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=6)
self.classifier = nn.Linear(d_model, n_classes)
self.dropout = nn.Dropout(0.1)
def forward(self, x, mask=None):
x = self.embedding(x) + self.pos_encoding[:, :x.size(1), :]
x = self.dropout(x)
x = self.transformer(x, src_key_padding_mask=mask)
x = x.mean(dim=1) # Global average pooling
return self.classifier(x)
Training configuration
Training configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerClassifier(vocab_size=30000).to(device)
optimizer = AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)
scheduler = CosineAnnealingLR(optimizer, T_max=10)
criterion = nn.CrossEntropyLoss()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerClassifier(vocab_size=30000).to(device)
optimizer = AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)
scheduler = CosineAnnealingLR(optimizer, T_max=10)
criterion = nn.CrossEntropyLoss()
Training loop with mixed precision
Training loop with mixed precision
scaler = torch.cuda.amp.GradScaler()
for epoch in range(10):
model.train()
for batch in train_loader:
optimizer.zero_grad()
with torch.cuda.amp.autocast():
logits = model(batch["input_ids"].to(device))
loss = criterion(logits, batch["labels"].to(device))
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
scheduler.step()
scaler = torch.cuda.amp.GradScaler()
for epoch in range(10):
model.train()
for batch in train_loader:
optimizer.zero_grad()
with torch.cuda.amp.autocast():
logits = model(batch["input_ids"].to(device))
loss = criterion(logits, batch["labels"].to(device))
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
scheduler.step()
Core Concepts
核心概念
1. Modern Neural Network Architectures
1. 现代神经网络架构
python
import torch
import torch.nn as nn
import torch.nn.functional as F
class ResidualBlock(nn.Module):
"""Residual block with skip connection."""
def __init__(self, channels: int):
super().__init__()
self.conv1 = nn.Conv2d(channels, channels, 3, padding=1)
self.bn1 = nn.BatchNorm2d(channels)
self.conv2 = nn.Conv2d(channels, channels, 3, padding=1)
self.bn2 = nn.BatchNorm2d(channels)
def forward(self, x):
residual = x
x = F.relu(self.bn1(self.conv1(x)))
x = self.bn2(self.conv2(x))
return F.relu(x + residual)
class AttentionBlock(nn.Module):
"""Multi-head self-attention."""
def __init__(self, d_model: int, n_heads: int = 8):
super().__init__()
self.attention = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
self.norm = nn.LayerNorm(d_model)
self.ffn = nn.Sequential(
nn.Linear(d_model, d_model * 4),
nn.GELU(),
nn.Linear(d_model * 4, d_model)
)
self.norm2 = nn.LayerNorm(d_model)
def forward(self, x, mask=None):
attn_out, _ = self.attention(x, x, x, attn_mask=mask)
x = self.norm(x + attn_out)
return self.norm2(x + self.ffn(x))
python
import torch
import torch.nn as nn
import torch.nn.functional as F
class ResidualBlock(nn.Module):
"""Residual block with skip connection."""
def __init__(self, channels: int):
super().__init__()
self.conv1 = nn.Conv2d(channels, channels, 3, padding=1)
self.bn1 = nn.BatchNorm2d(channels)
self.conv2 = nn.Conv2d(channels, channels, 3, padding=1)
self.bn2 = nn.BatchNorm2d(channels)
def forward(self, x):
residual = x
x = F.relu(self.bn1(self.conv1(x)))
x = self.bn2(self.conv2(x))
return F.relu(x + residual)
class AttentionBlock(nn.Module):
"""Multi-head self-attention."""
def __init__(self, d_model: int, n_heads: int = 8):
super().__init__()
self.attention = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
self.norm = nn.LayerNorm(d_model)
self.ffn = nn.Sequential(
nn.Linear(d_model, d_model * 4),
nn.GELU(),
nn.Linear(d_model * 4, d_model)
)
self.norm2 = nn.LayerNorm(d_model)
def forward(self, x, mask=None):
attn_out, _ = self.attention(x, x, x, attn_mask=mask)
x = self.norm(x + attn_out)
return self.norm2(x + self.ffn(x))
2. Training Best Practices
2. 训练最佳实践
python
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import OneCycleLR
python
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import OneCycleLR
Gradient clipping and accumulation
Gradient clipping and accumulation
def train_epoch(model, loader, optimizer, accumulation_steps=4):
model.train()
optimizer.zero_grad()
for i, batch in enumerate(loader):
with torch.cuda.amp.autocast():
loss = model(batch) / accumulation_steps
scaler.scale(loss).backward()
if (i + 1) % accumulation_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
scaler.step(optimizer)
scaler.update()
optimizer.zero_grad()
def train_epoch(model, loader, optimizer, accumulation_steps=4):
model.train()
optimizer.zero_grad()
for i, batch in enumerate(loader):
with torch.cuda.amp.autocast():
loss = model(batch) / accumulation_steps
scaler.scale(loss).backward()
if (i + 1) % accumulation_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
scaler.step(optimizer)
scaler.update()
optimizer.zero_grad()
Early stopping
Early stopping
class EarlyStopping:
def __init__(self, patience: int = 5, min_delta: float = 0.001):
self.patience = patience
self.min_delta = min_delta
self.counter = 0
self.best_loss = float('inf')
def __call__(self, val_loss: float) -> bool:
if val_loss < self.best_loss - self.min_delta:
self.best_loss = val_loss
self.counter = 0
else:
self.counter += 1
return self.counter >= self.patience
class EarlyStopping:
def __init__(self, patience: int = 5, min_delta: float = 0.001):
self.patience = patience
self.min_delta = min_delta
self.counter = 0
self.best_loss = float('inf')
def __call__(self, val_loss: float) -> bool:
if val_loss < self.best_loss - self.min_delta:
self.best_loss = val_loss
self.counter = 0
else:
self.counter += 1
return self.counter >= self.patience
Learning rate finder
Learning rate finder
def find_lr(model, loader, optimizer, start_lr=1e-7, end_lr=10, num_iter=100):
lrs, losses = [], []
lr_mult = (end_lr / start_lr) ** (1 / num_iter)
for i, batch in enumerate(loader):
if i >= num_iter:
break
lr = start_lr * (lr_mult ** i)
for pg in optimizer.param_groups:
pg['lr'] = lr
loss = train_step(model, batch, optimizer)
lrs.append(lr)
losses.append(loss)
return lrs, losses
def find_lr(model, loader, optimizer, start_lr=1e-7, end_lr=10, num_iter=100):
lrs, losses = [], []
lr_mult = (end_lr / start_lr) ** (1 / num_iter)
for i, batch in enumerate(loader):
if i >= num_iter:
break
lr = start_lr * (lr_mult ** i)
for pg in optimizer.param_groups:
pg['lr'] = lr
loss = train_step(model, batch, optimizer)
lrs.append(lr)
losses.append(loss)
return lrs, losses
3. Model Deployment
3. 模型部署
python
import torch.onnx
import onnxruntime as ort
python
import torch.onnx
import onnxruntime as ort
Export to ONNX
Export to ONNX
def export_to_onnx(model, sample_input, path="model.onnx"):
model.eval()
torch.onnx.export(
model,
sample_input,
path,
export_params=True,
opset_version=17,
do_constant_folding=True,
input_names=['input'],
output_names=['output'],
dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}
)
def export_to_onnx(model, sample_input, path="model.onnx"):
model.eval()
torch.onnx.export(
model,
sample_input,
path,
export_params=True,
opset_version=17,
do_constant_folding=True,
input_names=['input'],
output_names=['output'],
dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}
)
ONNX Runtime inference
ONNX Runtime inference
class ONNXPredictor:
def __init__(self, model_path: str):
self.session = ort.InferenceSession(model_path, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
def predict(self, input_data):
return self.session.run(None, {'input': input_data})[0]
class ONNXPredictor:
def __init__(self, model_path: str):
self.session = ort.InferenceSession(model_path, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
def predict(self, input_data):
return self.session.run(None, {'input': input_data})[0]
TorchScript for production
TorchScript for production
scripted_model = torch.jit.script(model)
scripted_model.save("model_scripted.pt")
scripted_model = torch.jit.script(model)
scripted_model.save("model_scripted.pt")
Tools & Technologies
工具与技术
| Tool | Purpose | Version (2025) |
|---|---|---|
| PyTorch | Deep learning framework | 2.2+ |
| PyTorch Lightning | Training framework | 2.2+ |
| Hugging Face | Transformers, datasets | 4.38+ |
| ONNX Runtime | Model inference | 1.17+ |
| TensorRT | GPU optimization | 8.6+ |
| Weights & Biases | Experiment tracking | Latest |
| Ray | Distributed training | 2.9+ |
| 工具 | 用途 | 版本(2025) |
|---|---|---|
| PyTorch | 深度学习框架 | 2.2+ |
| PyTorch Lightning | 训练框架 | 2.2+ |
| Hugging Face | Transformer模型与数据集 | 4.38+ |
| ONNX Runtime | 模型推理 | 1.17+ |
| TensorRT | GPU优化 | 8.6+ |
| Weights & Biases | 实验追踪 | 最新版 |
| Ray | 分布式训练 | 2.9+ |
Troubleshooting Guide
故障排除指南
| Issue | Symptoms | Root Cause | Fix |
|---|---|---|---|
| Vanishing Gradient | Loss not decreasing | Deep network, wrong activation | Use ReLU/GELU, residual connections |
| Exploding Gradient | NaN loss | Learning rate too high | Gradient clipping, lower LR |
| Overfitting | Train >> Val accuracy | Model too complex | Dropout, regularization, data aug |
| OOM Error | CUDA out of memory | Batch too large | Reduce batch, gradient accumulation |
| Slow Training | Low GPU utilization | Data loading bottleneck | More workers, prefetch |
| 问题 | 症状 | 根本原因 | 解决方法 |
|---|---|---|---|
| 梯度消失 | 损失值不再下降 | 网络过深、激活函数选择不当 | 使用ReLU/GELU激活函数、添加残差连接 |
| 梯度爆炸 | 损失值为NaN | 学习率过高 | 梯度裁剪、降低学习率 |
| 过拟合 | 训练准确率远高于验证准确率 | 模型过于复杂 | 使用Dropout、正则化、数据增强 |
| OOM错误 | CUDA内存不足 | 批次尺寸过大 | 减小批次尺寸、使用梯度累积 |
| 训练缓慢 | GPU利用率低 | 数据加载瓶颈 | 增加工作线程数、预取数据 |
Debug Commands
调试命令
python
python
Check GPU memory
Check GPU memory
print(torch.cuda.memory_summary())
print(torch.cuda.memory_summary())
Profile training
Profile training
with torch.profiler.profile(
activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA]
) as prof:
train_step(model, batch, optimizer)
print(prof.key_averages().table(sort_by="cuda_time_total"))
with torch.profiler.profile(
activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA]
) as prof:
train_step(model, batch, optimizer)
print(prof.key_averages().table(sort_by="cuda_time_total"))
Gradient flow check
Gradient flow check
for name, param in model.named_parameters():
if param.grad is not None:
print(f"{name}: grad_mean={param.grad.mean():.6f}")
for name, param in model.named_parameters():
if param.grad is not None:
print(f"{name}: grad_mean={param.grad.mean():.6f}")
Best Practices
最佳实践
python
python
✅ DO: Use mixed precision training
✅ DO: Use mixed precision training
with torch.cuda.amp.autocast():
output = model(input)
with torch.cuda.amp.autocast():
output = model(input)
✅ DO: Initialize weights properly
✅ DO: Initialize weights properly
def init_weights(m):
if isinstance(m, nn.Linear):
nn.init.xavier_uniform_(m.weight)
def init_weights(m):
if isinstance(m, nn.Linear):
nn.init.xavier_uniform_(m.weight)
✅ DO: Use gradient checkpointing for large models
✅ DO: Use gradient checkpointing for large models
from torch.utils.checkpoint import checkpoint
x = checkpoint(self.layer, x)
from torch.utils.checkpoint import checkpoint
x = checkpoint(self.layer, x)
✅ DO: Freeze base model for fine-tuning
✅ DO: Freeze base model for fine-tuning
for param in model.base.parameters():
param.requires_grad = False
for param in model.base.parameters():
param.requires_grad = False
❌ DON'T: Use dropout during inference
❌ DON'T: Use dropout during inference
model.eval()
model.eval()
❌ DON'T: Forget to move data to device
❌ DON'T: Forget to move data to device
Resources
资源
- PyTorch Tutorials
- Hugging Face Course
- Fast.ai
- "Deep Learning" by Goodfellow et al.
Skill Certification Checklist:
- Can build and train neural networks in PyTorch
- Can implement attention mechanisms and transformers
- Can use mixed precision and gradient accumulation
- Can export models to ONNX/TorchScript
- Can debug training issues (gradients, memory)
- PyTorch官方教程
- Hugging Face课程
- Fast.ai课程
- 《深度学习》(Goodfellow等著)
技能认证清单:
- 能够使用PyTorch构建并训练神经网络
- 能够实现注意力机制与Transformer模型
- 能够使用混合精度训练与梯度累积
- 能够将模型导出为ONNX/TorchScript格式
- 能够调试训练问题(梯度、内存等)