deep-learning

Deep Learning

Production-grade deep learning with PyTorch, neural network architectures, and modern training practices.

Quick Start

PyTorch Production Training Loop

```python
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
import wandb

class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size: int, d_model: int = 256, n_heads: int = 8, n_classes: int = 2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        # Learned positional encoding; caps sequence length at 512 tokens
        self.pos_encoding = nn.Parameter(torch.randn(1, 512, d_model))
        encoder_layer = nn.TransformerEncoderLayer(
            d_model, n_heads, dim_feedforward=1024, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=6)
        self.classifier = nn.Linear(d_model, n_classes)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x, mask=None):
        x = self.embedding(x) + self.pos_encoding[:, :x.size(1), :]
        x = self.dropout(x)
        x = self.transformer(x, src_key_padding_mask=mask)
        x = x.mean(dim=1)  # Global average pooling over the sequence
        return self.classifier(x)
```

Training configuration

```python
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerClassifier(vocab_size=30000).to(device)
optimizer = AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)
scheduler = CosineAnnealingLR(optimizer, T_max=10)
criterion = nn.CrossEntropyLoss()
```

Training loop with mixed precision

```python
# train_loader is assumed to be a DataLoader yielding {"input_ids", "labels"} batches
scaler = torch.cuda.amp.GradScaler()

for epoch in range(10):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            logits = model(batch["input_ids"].to(device))
            loss = criterion(logits, batch["labels"].to(device))
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
    scheduler.step()  # CosineAnnealingLR steps once per epoch
```
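
`wandb` is imported above but never called. A minimal validation-and-logging sketch to close that gap, assuming a `val_loader` exists and `wandb.init(...)` has already been run (both are assumptions):

```python
model.eval()
correct, total, val_loss = 0, 0, 0.0
with torch.no_grad():
    for batch in val_loader:
        logits = model(batch["input_ids"].to(device))
        labels = batch["labels"].to(device)
        val_loss += criterion(logits, labels).item()
        correct += (logits.argmax(dim=-1) == labels).sum().item()
        total += labels.size(0)

wandb.log({"val_loss": val_loss / len(val_loader), "val_acc": correct / total})
```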

Core Concepts

1. Modern Neural Network Architectures

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class ResidualBlock(nn.Module):
    """Residual block with skip connection."""
    def __init__(self, channels: int):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, channels, 3, padding=1)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(channels)

    def forward(self, x):
        residual = x
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.bn2(self.conv2(x))
        return F.relu(x + residual)

class AttentionBlock(nn.Module):
    """Multi-head self-attention."""
    def __init__(self, d_model: int, n_heads: int = 8):
        super().__init__()
        self.attention = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.norm = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_model * 4),
            nn.GELU(),
            nn.Linear(d_model * 4, d_model)
        )
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        attn_out, _ = self.attention(x, x, x, attn_mask=mask)
        x = self.norm(x + attn_out)
        return self.norm2(x + self.ffn(x))
```
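
A quick shape check makes the contracts of the two blocks concrete; the batch size, channel count, and sequence length below are illustrative values:

```python
res = ResidualBlock(channels=64)
img = torch.randn(8, 64, 32, 32)   # (batch, channels, height, width)
print(res(img).shape)              # torch.Size([8, 64, 32, 32]) -- shape preserved

attn = AttentionBlock(d_model=256)
seq = torch.randn(8, 128, 256)     # (batch, seq_len, d_model), batch_first=True
print(attn(seq).shape)             # torch.Size([8, 128, 256]) -- shape preserved
```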

2. Training Best Practices

```python
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import OneCycleLR
```
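
`OneCycleLR` is imported here but not wired up below; a minimal sketch of using it (the `max_lr` value is an assumption):

```python
scheduler = OneCycleLR(
    optimizer,
    max_lr=3e-4,
    epochs=10,
    steps_per_epoch=len(train_loader),
)
# Unlike CosineAnnealingLR, OneCycleLR is stepped once per batch, not per epoch
```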

Gradient clipping and accumulation

```python
def train_epoch(model, loader, criterion, optimizer, scaler, accumulation_steps=4):
    model.train()
    optimizer.zero_grad()

    for i, batch in enumerate(loader):
        with torch.cuda.amp.autocast():
            logits = model(batch["input_ids"].to(device))
            # Divide so gradients accumulated over several batches average correctly
            loss = criterion(logits, batch["labels"].to(device)) / accumulation_steps

        scaler.scale(loss).backward()

        # Update weights only every `accumulation_steps` batches
        if (i + 1) % accumulation_steps == 0:
            scaler.unscale_(optimizer)  # unscale before clipping so max_norm applies to true gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
```
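
A minimal driver for the function above, assuming the model, criterion, optimizer, and train_loader from the Quick Start:

```python
scaler = torch.cuda.amp.GradScaler()
for epoch in range(10):
    train_epoch(model, train_loader, criterion, optimizer, scaler)
```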

Early stopping

```python
class EarlyStopping:
    def __init__(self, patience: int = 5, min_delta: float = 0.001):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = float('inf')

    def __call__(self, val_loss: float) -> bool:
        # Returns True once the loss has failed to improve for `patience` calls
        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
        return self.counter >= self.patience
```
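
Typical usage in a validation loop; `evaluate` stands in for whatever computes the validation loss and is an assumed helper:

```python
early_stop = EarlyStopping(patience=5)
for epoch in range(100):
    val_loss = evaluate(model, val_loader)  # assumed helper returning mean val loss
    if early_stop(val_loss):
        print(f"Stopping early at epoch {epoch}")
        break
```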

Learning rate finder

```python
def find_lr(model, loader, optimizer, start_lr=1e-7, end_lr=10, num_iter=100):
    """Exponentially sweep the learning rate and record the loss at each step."""
    lrs, losses = [], []
    lr_mult = (end_lr / start_lr) ** (1 / num_iter)

    for i, batch in enumerate(loader):
        if i >= num_iter:
            break

        lr = start_lr * (lr_mult ** i)
        for pg in optimizer.param_groups:
            pg['lr'] = lr

        loss = train_step(model, batch, optimizer)  # train_step: one forward/backward/update
        lrs.append(lr)
        losses.append(loss)

    return lrs, losses
```
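
To choose a learning rate from the sweep, a common reading is to plot loss against LR on a log axis and pick a value slightly below the loss minimum; a sketch, assuming matplotlib is available:

```python
import matplotlib.pyplot as plt

lrs, losses = find_lr(model, train_loader, optimizer)
plt.plot(lrs, losses)
plt.xscale("log")
plt.xlabel("learning rate")
plt.ylabel("loss")
plt.show()
```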

3. Model Deployment

```python
import torch.onnx
import onnxruntime as ort
```

Export to ONNX

```python
def export_to_onnx(model, sample_input, path="model.onnx"):
    model.eval()
    torch.onnx.export(
        model,
        sample_input,
        path,
        export_params=True,
        opset_version=17,
        do_constant_folding=True,
        input_names=['input'],
        output_names=['output'],
        # Allow variable batch sizes at inference time
        dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}
    )
```
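
An example call for the Quick Start classifier; the dummy token shape (1, 128) is an assumption:

```python
sample = torch.randint(0, 30000, (1, 128))  # dummy token IDs
export_to_onnx(model.cpu(), sample)
```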

ONNX Runtime inference

```python
class ONNXPredictor:
    def __init__(self, model_path: str):
        # Prefer the GPU provider, falling back to CPU
        self.session = ort.InferenceSession(
            model_path,
            providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
        )

    def predict(self, input_data):
        return self.session.run(None, {'input': input_data})[0]
```
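
ONNX Runtime consumes NumPy arrays rather than tensors; a usage sketch with dummy int64 token IDs, shaped to match the exported classifier above:

```python
import numpy as np

predictor = ONNXPredictor("model.onnx")
tokens = np.random.randint(0, 30000, size=(4, 128), dtype=np.int64)
logits = predictor.predict(tokens)  # numpy array of shape (4, n_classes)
```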

TorchScript for production

```python
scripted_model = torch.jit.script(model)
scripted_model.save("model_scripted.pt")
```
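
When `torch.jit.script` fails on dynamic Python constructs, tracing with a representative input is the usual fallback; the input shape here is an assumption:

```python
traced = torch.jit.trace(model, torch.randint(0, 30000, (1, 128)))
traced.save("model_traced.pt")
```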

Tools & Technologies

| Tool | Purpose | Version (2025) |
|------|---------|----------------|
| PyTorch | Deep learning framework | 2.2+ |
| PyTorch Lightning | Training framework | 2.2+ |
| Hugging Face | Transformers, datasets | 4.38+ |
| ONNX Runtime | Model inference | 1.17+ |
| TensorRT | GPU optimization | 8.6+ |
| Weights & Biases | Experiment tracking | Latest |
| Ray | Distributed training | 2.9+ |

Troubleshooting Guide

| Issue | Symptoms | Root Cause | Fix |
|-------|----------|------------|-----|
| Vanishing gradient | Loss not decreasing | Deep network, wrong activation | Use ReLU/GELU, residual connections |
| Exploding gradient | NaN loss | Learning rate too high | Gradient clipping, lower LR |
| Overfitting | Train accuracy >> val accuracy | Model too complex | Dropout, regularization, data augmentation |
| OOM error | CUDA out of memory | Batch too large | Reduce batch size, gradient accumulation |
| Slow training | Low GPU utilization | Data loading bottleneck | More workers, prefetching |
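
For the data-loading bottleneck in the last row, the standard DataLoader knobs look like this; `dataset` is assumed to exist and the specific values are illustrative:

```python
loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,            # parallel worker processes for loading
    pin_memory=True,          # page-locked memory speeds host-to-GPU copies
    prefetch_factor=2,        # batches fetched ahead per worker
    persistent_workers=True,  # keep workers alive across epochs
)
```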

Debug Commands

Check GPU memory

```python
print(torch.cuda.memory_summary())
```

Profile training

```python
with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA]
) as prof:
    train_step(model, batch, optimizer)
print(prof.key_averages().table(sort_by="cuda_time_total"))
```

Gradient flow check

```python
for name, param in model.named_parameters():
    if param.grad is not None:
        print(f"{name}: grad_mean={param.grad.mean():.6f}")
```

Best Practices

✅ DO: Use mixed precision training

```python
with torch.cuda.amp.autocast():
    output = model(input)
```

✅ DO: Initialize weights properly

```python
def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
```
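
`nn.Module.apply` walks every submodule recursively, so the initializer is attached in one line:

```python
model.apply(init_weights)
```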

✅ DO: Use gradient checkpointing for large models

```python
from torch.utils.checkpoint import checkpoint
x = checkpoint(self.layer, x)  # recomputes activations in backward to save memory
```

✅ DO: Freeze base model for fine-tuning

✅ DO: Freeze base model for fine-tuning

for param in model.base.parameters(): param.requires_grad = False
for param in model.base.parameters(): param.requires_grad = False
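
After freezing, it is common to hand the optimizer only the trainable parameters; a sketch assuming the AdamW setup from earlier:

```python
optimizer = AdamW(
    (p for p in model.parameters() if p.requires_grad),
    lr=1e-4,
)
```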

❌ DON'T: Use dropout during inference

```python
model.eval()  # disables dropout and freezes BatchNorm statistics
```

❌ DON'T: Forget to move data to device
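
A minimal sketch of the fix, assuming `inputs` and `labels` tensors and the `device` defined earlier:

```python
inputs = inputs.to(device)
labels = labels.to(device)
output = model(inputs)  # model and data now live on the same device
```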

Resources

Skill Certification Checklist:
  • Can build and train neural networks in PyTorch
  • Can implement attention mechanisms and transformers
  • Can use mixed precision and gradient accumulation
  • Can export models to ONNX/TorchScript
  • Can debug training issues (gradients, memory)
