Advanced sub-skill for PyTorch focused on deep research and production engineering. Covers custom Autograd functions, module hooks, advanced initialization, Distributed Data Parallel (DDP), and performance profiling.
npx skill4agent add tondevrel/scientific-agent-skills pytorch-research

Key concepts: nn.Sequential, torch.autograd.Function, register_forward_hook, DistributedDataParallel, torch.nn.init, .grad, backward(), retain_graph=True

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data import DataLoader

# Custom autograd function: binarize with sign() in forward, straight-through gradient in backward
class MySignFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input):
        # Save input for backward pass
        ctx.save_for_backward(input)
        return torch.sign(input)

    @staticmethod
    def backward(ctx, grad_output):
        # Straight-through estimator (STE) logic
        input, = ctx.saved_tensors
        grad_input = grad_output.clone()
        # Custom logic: gradients pass through as if it were an identity
        return grad_input
# Usage
my_sign = MySignFunction.apply
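To sanity-check the straight-through behavior, run a tensor through the function and inspect the gradient after backward(); this is a minimal sketch, and the input below is illustrative.

x = torch.randn(4, requires_grad=True)  # illustrative input
out = my_sign(x)                        # forward: sign(x)
out.sum().backward()
print(x.grad)                           # all ones: the gradient passed straight through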
print(f"Module: {module.__class__.__name__}, Grad Norm: {grad_output[0].norm().item()}")
# Attach to a specific layer
model.fc1.register_full_backward_hook(print_grad_norm)
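register_full_backward_hook returns a removable handle; keeping it and calling remove() once debugging is done stops the hook from firing (and printing) on every later pass. A minimal sketch:

handle = model.fc1.register_full_backward_hook(print_grad_norm)
# ... forward/backward passes to inspect ...
handle.remove()  # detach the hook when finished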
# Extract activations (Forward Hook)
activations = {}
def get_activation(name):
    def hook(model, input, output):
        activations[name] = output.detach()
    return hook
model.conv1.register_forward_hook(get_activation('conv1'))
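Any forward pass now populates the dictionary; the input shape below is illustrative and assumes conv1 expects a 3-channel image.

dummy = torch.randn(1, 3, 224, 224)  # illustrative input
_ = model(dummy)
print(activations['conv1'].shape)    # activation captured by the hook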
import torch.multiprocessing as mp

def setup(rank, world_size):
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

def train(rank, world_size):
    setup(rank, world_size)
    # MyModel and dataset are assumed to be defined elsewhere
    model = MyModel().to(rank)
    ddp_model = DDP(model, device_ids=[rank])
    # Use DistributedSampler to ensure each GPU sees different data
    sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank)
    loader = DataLoader(dataset, sampler=sampler, batch_size=32)
    optimizer = torch.optim.Adam(ddp_model.parameters(), lr=0.001)
    # ... training loop ...
    dist.destroy_process_group()
# mp.spawn(train, args=(world_size,), nprocs=world_size)
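init_process_group needs a rendezvous address, so a minimal single-node launcher typically looks like the sketch below; the address and port are illustrative.

import os

if __name__ == "__main__":
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "29500")  # any free port
    world_size = torch.cuda.device_count()
    mp.spawn(train, args=(world_size,), nprocs=world_size)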
from torch.profiler import profile, record_function, ProfilerActivity

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
             record_shapes=True) as prof:
    with record_function("model_inference"):
        model(inputs)
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
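The same profile object can also be exported as a Chrome trace for timeline inspection; the filename below is illustrative.

prof.export_chrome_trace("trace.json")  # open in chrome://tracing or Perfetto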
from torch.utils.checkpoint import checkpoint

class DeepModel(nn.Module):
    def forward(self, x):
        # Instead of storing all activations, recompute them during backward
        # (recent PyTorch versions recommend passing use_reentrant=False explicitly)
        x = checkpoint(self.heavy_layer_1, x)
        x = checkpoint(self.heavy_layer_2, x)
        return x
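For purely sequential stacks, checkpoint_sequential splits an nn.Sequential into segments and checkpoints each one; this is a sketch, and the layer sizes below are illustrative.

from torch.utils.checkpoint import checkpoint_sequential

seq = nn.Sequential(nn.Linear(512, 512), nn.ReLU(), nn.Linear(512, 512), nn.ReLU())
x = torch.randn(8, 512, requires_grad=True)
out = checkpoint_sequential(seq, 2, x)  # 2 segments, activations recomputed per segment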
def init_weights(m):
    if isinstance(m, nn.Linear):
        # Kaiming initialization for ReLU networks
        torch.nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
        if m.bias is not None:
            torch.nn.init.constant_(m.bias, 0)
    elif isinstance(m, nn.Conv2d):
        torch.nn.init.xavier_uniform_(m.weight)
model.apply(init_weights)

# Inside training loop
loss.backward()
# Clip to prevent exploding gradients (standard in RNNs/Transformers)
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.step()
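clip_grad_norm_ returns the total gradient norm measured before clipping, which is useful to log for spotting instability; a sketch with an illustrative threshold:

total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
if total_norm > 10.0:  # illustrative threshold
    print(f"Large gradient norm before clipping: {total_norm.item():.2f}")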
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.1,
                                                steps_per_epoch=len(train_loader),
                                                epochs=10)
for epoch in range(10):
    for batch in train_loader:
        train_batch()
        scheduler.step()  # Step every batch for OneCycle

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation.
# ❌ Problem: x += 1 (breaks backward pass)
# ✅ Solution: y = x + 1 (creates a new tensor)
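A minimal reproduction (illustrative shapes): exp() saves its output for the backward pass, so modifying that output in place triggers the error above.

x = torch.randn(3, requires_grad=True)
y = torch.exp(x)
# y += 1            # ❌ in-place: raises the RuntimeError at backward()
y = y + 1           # ✅ out-of-place: a new tensor, the graph stays valid
y.sum().backward()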
Key APIs: step(), torch.cuda.empty_cache(), torch.cuda.amp, zero_grad(), backward()

# ✅ Correct order:
optimizer.zero_grad()
loss.backward()
optimizer.step()
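torch.cuda.amp (listed above) keeps the same zero_grad/backward/step order but routes backward and step through a GradScaler; a minimal sketch of one mixed-precision step, assuming a CUDA device and an existing model, loss_fn, inputs, and targets.

scaler = torch.cuda.amp.GradScaler()

optimizer.zero_grad()
with torch.cuda.amp.autocast():
    output = model(inputs)
    loss = loss_fn(output, targets)
scaler.scale(loss).backward()  # backward on the scaled loss
scaler.step(optimizer)         # unscales gradients, then calls optimizer.step()
scaler.update()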