Expert GPU optimization for modern consumer GPUs (8-24GB VRAM). Use this skill when you need to optimize GPU training, speed up CUDA code, reduce OOM errors, tune XGBoost for GPU, migrate NumPy to CuPy, make a model faster, manage GPU memory, optimize VRAM usage, or benchmark PyTorch. Covers mixed precision, gradient checkpointing, XGBoost GPU acceleration, CuPy/cuDF migration, vectorization, torch.compile, and diagnostics. NVIDIA GPUs only. PyTorch, XGBoost, and RAPIDS frameworks.
```bash
npx skill4agent add mathews-tom/praxis-skills gpu-optimizer
```

| Property | Your Value |
|---|---|
| GPU model | (e.g., RTX 4080 Mobile, RTX 3090, RTX 4090) |
| VRAM | (e.g., 12GB, 16GB, 24GB) |
| CUDA version | (check with `nvcc --version`) |
| TDP / power limit | (laptop vs desktop affects sustained throughput) |
| Driver version | (check with `nvidia-smi`) |
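A minimal sketch for filling in this table programmatically; it assumes PyTorch is installed and only reads device properties (the `nvidia-smi --query-gpu` flags are standard CLI options):

```python
import subprocess
import torch

if torch.cuda.is_available():
    props = torch.cuda.get_device_properties(0)
    print(f"GPU model:       {props.name}")
    print(f"VRAM:            {props.total_memory / 1024**3:.1f} GB")
    print(f"CUDA (PyTorch):  {torch.version.cuda}")
    # Driver version comes from nvidia-smi, not from PyTorch
    driver = subprocess.run(
        ["nvidia-smi", "--query-gpu=driver_version", "--format=csv,noheader"],
        capture_output=True, text=True,
    ).stdout.strip()
    print(f"Driver version:  {driver}")
```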
```python
import numpy as np
import xgboost as xgb

# GPU-optimized: QuantileDMatrix is 1.8x faster for training
dtrain = xgb.QuantileDMatrix(X_train.astype(np.float32), label=y_train)
dval = xgb.QuantileDMatrix(X_val.astype(np.float32), label=y_val)

# Standard DMatrix (use for inference only)
dtest = xgb.DMatrix(X_test.astype(np.float32))

params = {
    'tree_method': 'hist',       # GPU-accelerated histogram algorithm
    'device': 'cuda:0',          # Explicit GPU device (XGBoost >= 2.0 API)
    'max_bin': 256,              # Higher bins = better splits (VRAM permitting)
    'grow_policy': 'depthwise',  # vs 'lossguide' for imbalanced data
    # 'predictor' is deprecated in XGBoost >= 2.0; 'device' already routes
    # prediction to the GPU.
}

# Training with explicit device
model = xgb.train(params, dtrain, num_boost_round=100)
```
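The `dval` matrix created above can also drive early stopping. A hedged sketch: the patience of 20 rounds and the logging interval are illustrative choices, and XGBoost recommends building a validation `QuantileDMatrix` with `ref=dtrain` so it reuses the training quantile cuts:

```python
# Validation matrix shares the training quantile cuts via ref=
dval = xgb.QuantileDMatrix(X_val.astype(np.float32), label=y_val, ref=dtrain)

model = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=[(dtrain, 'train'), (dval, 'val')],
    early_stopping_rounds=20,
    verbose_eval=50,
)
print(f"Best iteration: {model.best_iteration}")
```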
"""Verify XGBoost GPU availability. Raises if unavailable."""
import subprocess
try:
result = subprocess.run(["nvidia-smi"], capture_output=True, text=True)
if result.returncode != 0:
raise RuntimeError("nvidia-smi failed - no GPU available")
except FileNotFoundError:
raise RuntimeError("nvidia-smi not found - no GPU available")
build_info = xgb.build_info()
if not build_info.get("USE_CUDA"):
raise RuntimeError("XGBoost not compiled with CUDA support")# Single-pass training (reuse QuantileDMatrix across slots)
```python
# Single-pass training (reuse QuantileDMatrix across slots)
dtrain = xgb.QuantileDMatrix(X_train.astype(np.float32))

for slot_idx in range(num_slots):
    dtrain.set_label(y_train[:, slot_idx])  # Reuse matrix, swap labels
    model = xgb.train(params, dtrain, num_boost_round=100)
```
```python
import torch
from torch.amp import autocast, GradScaler

# Auto-detect best precision
if torch.cuda.is_bf16_supported():
    amp_dtype = torch.bfloat16  # Ampere+ GPUs support BF16
else:
    amp_dtype = torch.float16

# GradScaler is only needed for FP16; BF16 has enough dynamic range
scaler = GradScaler('cuda') if amp_dtype == torch.float16 else None

# Training step
with autocast('cuda', dtype=amp_dtype):
    output = model(input_ids, attention_mask)
    loss = criterion(output, targets)

# Backward with scaling (FP16 only)
if scaler:
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
else:
    loss.backward()
    optimizer.step()
```
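To check that autocast actually pays off on your model, a hedged timing sketch: the warm-up and iteration counts are arbitrary, `model` and `batch` stand in for your own GPU-resident objects, and `torch.cuda.synchronize()` is required for honest timings because GPU work is asynchronous:

```python
import time
import torch

def time_forward(model, batch, dtype=None, iters=50, warmup=10):
    """Rough per-step latency; assumes model and batch already live on the GPU."""
    with torch.inference_mode():
        for i in range(warmup + iters):
            if i == warmup:
                torch.cuda.synchronize()
                start = time.perf_counter()
            with torch.autocast('cuda', dtype=dtype, enabled=dtype is not None):
                model(batch)
    torch.cuda.synchronize()
    return (time.perf_counter() - start) / iters

# Example comparison (placeholders for your own model/batch):
# fp32_ms = time_forward(model, batch) * 1e3
# bf16_ms = time_forward(model, batch, dtype=torch.bfloat16) * 1e3
```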
```python
# Saves ~40% VRAM, adds ~20% compute time
model.gradient_checkpointing_enable()

# For transformers:
model.base_model.model.gradient_checkpointing_enable()
```
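For custom `nn.Module` stacks that lack a `gradient_checkpointing_enable()` helper, the same VRAM-for-compute trade can be made manually with `torch.utils.checkpoint`. A minimal sketch; the block structure and the `use_reentrant=False` choice are assumptions, not part of the original:

```python
import torch.nn as nn
from torch.utils.checkpoint import checkpoint

class CheckpointedStack(nn.Module):
    """Recompute each block's activations in backward instead of storing them."""
    def __init__(self, blocks: nn.ModuleList):
        super().__init__()
        self.blocks = blocks

    def forward(self, x):
        for block in self.blocks:
            # use_reentrant=False is the recommended non-reentrant variant
            x = checkpoint(block, x, use_reentrant=False)
        return x
```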
```python
import torch

torch.cuda.reset_peak_memory_stats()
# ... training ...
peak_vram_gb = torch.cuda.max_memory_allocated() / 1024**3
print(f"Peak VRAM: {peak_vram_gb:.2f} GB")

# Clear cache between experiments
torch.cuda.empty_cache()
```
```python
# Simulate larger batch size without OOM
grad_accum_steps = max(1, target_batch_size // actual_batch_size)

for i, batch in enumerate(dataloader):
    loss = model(batch) / grad_accum_steps
    loss.backward()
    if (i + 1) % grad_accum_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
```
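Combining the two patterns above (autocast plus accumulation) only requires placing the scaler calls on the update steps. A sketch, assuming the `scaler`, `amp_dtype`, `grad_accum_steps`, and training objects defined earlier:

```python
for i, batch in enumerate(dataloader):
    with torch.autocast('cuda', dtype=amp_dtype):
        loss = model(batch) / grad_accum_steps

    # Accumulate (possibly scaled) gradients on every step...
    (scaler.scale(loss) if scaler else loss).backward()

    # ...but only step/update the optimizer on accumulation boundaries
    if (i + 1) % grad_accum_steps == 0:
        if scaler:
            scaler.step(optimizer)
            scaler.update()
        else:
            optimizer.step()
        optimizer.zero_grad()
```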
{"batch_size": 2, "seq_len": 128, "grad_ckpt": True, "amp": "bf16"},
{"batch_size": 4, "seq_len": 256, "grad_ckpt": True, "amp": "bf16"},
{"batch_size": 8, "seq_len": 512, "grad_ckpt": False, "amp": "bf16"},
{"batch_size": 16, "seq_len": 256, "grad_ckpt": False, "amp": "bf16"},
]# Slow: Python loop
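One way to sweep these configs and record peak VRAM per run; `run_experiment` is a hypothetical helper standing in for your own training step, not something defined above:

```python
import torch

results = []
for cfg in EXPERIMENTS:
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    try:
        run_experiment(**cfg)  # hypothetical: builds model/dataloader from cfg
        peak_gb = torch.cuda.max_memory_allocated() / 1024**3
        results.append({**cfg, "peak_vram_gb": round(peak_gb, 2), "oom": False})
    except torch.cuda.OutOfMemoryError:
        results.append({**cfg, "peak_vram_gb": None, "oom": True})

for r in results:
    print(r)
```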
```python
# Slow: Python loop
for i, token_id in enumerate(input_ids):
    type_id = token_to_type[token_id]
    embeddings[i] = type_embeddings[type_id]

# Fast: Vectorized
type_ids = token_to_type[input_ids]     # Broadcast lookup
embeddings = type_embeddings[type_ids]  # Single GPU kernel
```
```python
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        # Build lookup tensors once
        type_ids = torch.zeros(vocab_size, dtype=torch.long)
        self.register_buffer('_type_ids', type_ids)  # Stays on GPU with the module

    def forward(self, input_ids):
        return self._type_ids[input_ids]  # Vectorized lookup
```
```python
# Slow: Per-sample processing
outputs = [model(x.unsqueeze(0)) for x in batch]

# Fast: Batched
outputs = model(batch)  # Single forward pass
```
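GPU kernels launch asynchronously, so naive wall-clock comparisons of the "slow vs fast" variants can be misleading. A sketch using CUDA events; the `fn` callable is a placeholder for either variant above:

```python
import torch

def cuda_time_ms(fn, iters=100):
    """Average GPU time per call in milliseconds, measured with CUDA events."""
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    fn()  # warm-up (lazy init, kernel selection)
    torch.cuda.synchronize()
    start.record()
    for _ in range(iters):
        fn()
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / iters

# cuda_time_ms(lambda: model(batch))
```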
```python
import cupy as cp
import numpy as np

# NumPy (CPU)
x = np.random.randn(10000, 1000)
y = np.dot(x, x.T)

# CuPy (GPU) - SAME API
x_gpu = cp.random.randn(10000, 1000)
y_gpu = cp.dot(x_gpu, x_gpu.T)

# Transfer back if needed
y_cpu = cp.asnumpy(y_gpu)
```
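CuPy keeps freed GPU memory in its own pool, so `nvidia-smi` can look "full" even after arrays are deleted. A sketch of releasing the pool between experiments, using the standard CuPy memory-pool API and the arrays from the block above:

```python
import cupy as cp

# Drop references, then return pooled blocks to the driver
del x_gpu, y_gpu
cp.get_default_memory_pool().free_all_blocks()
cp.get_default_pinned_memory_pool().free_all_blocks()

print(f"Pool in use: {cp.get_default_memory_pool().used_bytes() / 1024**2:.1f} MB")
```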
```python
# CuPy → PyTorch (zero-copy)
x_cupy = cp.random.randn(1000, 1000)
x_torch = torch.as_tensor(x_cupy, device='cuda')

# PyTorch → CuPy (zero-copy)
x_torch = torch.randn(1000, 1000, device='cuda')
x_cupy = cp.asarray(x_torch)
```

```bash
uv pip install cupy-cuda12x   # For CUDA 12.x
```
```python
import cudf
import pandas as pd

# Pandas (CPU)
df = pd.read_csv('large.csv')
grouped = df.groupby('category')['value'].mean()

# cuDF (GPU) - SAME API
df_gpu = cudf.read_csv('large.csv')
grouped_gpu = df_gpu.groupby('category')['value'].mean()

# Transfer back
grouped_cpu = grouped_gpu.to_pandas()
```
```python
import cudf
import xgboost as xgb

# Load data on GPU
df = cudf.read_csv('train.csv')
X = df[feature_cols]
y = df['target']

# Create DMatrix directly from cuDF (no CPU copy)
dtrain = xgb.DMatrix(X, label=y)
```
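The same zero-copy path should also work with the `QuantileDMatrix` recommended earlier for training. A short sketch, assuming the `params` dict from the XGBoost section and that your XGBoost build accepts cuDF inputs for `QuantileDMatrix`:

```python
# Keep features and labels on the GPU end-to-end
dtrain = xgb.QuantileDMatrix(X, label=y)
model = xgb.train(params, dtrain, num_boost_round=100)
```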
```bash
# RAPIDS (includes cuDF, cuML, cuGraph)
uv pip install cudf-cu12 --extra-index-url=https://pypi.nvidia.com
```
```python
# Check availability
use_fused = (
    torch.cuda.is_available()
    and "fused" in torch.optim.AdamW.__init__.__code__.co_varnames
)

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-3,
    fused=use_fused,  # Single GPU kernel (2-3x faster optimizer step)
)
```
```python
# PyTorch 2.0+ compile
if hasattr(torch, "compile"):
    model = torch.compile(model, mode="reduce-overhead")
```
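`torch.compile` can fail on unsupported ops or older GPUs, so a defensive wrapper that keeps the eager model as the fallback is often worthwhile. A sketch; the try/except structure and `sample_batch` are assumptions, not part of the original:

```python
try:
    compiled = torch.compile(model, mode="reduce-overhead")
    compiled(sample_batch)  # first call compiles; surfaces backend errors early
    model = compiled
except Exception as exc:
    print(f"torch.compile failed, staying in eager mode: {exc}")
    model = model  # fall back to the uncompiled module
```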
```python
# Auto-tune cuDNN kernels (slower startup, faster steady-state training)
torch.backends.cudnn.benchmark = True

# For reproducible runs, do the opposite:
# torch.backends.cudnn.benchmark = False
# torch.backends.cudnn.deterministic = True
```
```python
class WeightedSlotLoss(nn.Module):
    def __init__(self, slot_weights):
        super().__init__()
        # Register as a buffer so the weights follow the module to the GPU
        self.register_buffer('slot_weights', torch.tensor(slot_weights))

    def forward(self, logits_list, targets):
        weighted_losses = []
        for i, logits in enumerate(logits_list):
            loss = F.cross_entropy(logits, targets[:, i])
            weighted_losses.append(loss * self.slot_weights[i])
        return torch.stack(weighted_losses).sum() / self.slot_weights.sum()
```
```python
class FocalLoss(nn.Module):
    """Down-weights easy examples; useful under heavy class imbalance."""
    def __init__(self, gamma=2.0):
        super().__init__()
        self.gamma = gamma

    def forward(self, logits, targets):
        ce_loss = F.cross_entropy(logits, targets, reduction='none')
        pt = torch.exp(-ce_loss)  # probability assigned to the true class
        focal_loss = ((1 - pt) ** self.gamma) * ce_loss
        return focal_loss.mean()
```
```python
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self._pos_cache = {}  # {seq_len: positions}

    def forward(self, x):
        T = x.size(1)
        if T not in self._pos_cache:
            self._pos_cache[T] = torch.arange(T, device=x.device)
            # Limit cache size
            if len(self._pos_cache) > 10:
                self._pos_cache.pop(next(iter(self._pos_cache)))
        return self.pos_embed(self._pos_cache[T])
```
```python
def _create_causal_mask(self, T, device):
    if T not in self._mask_cache:
        mask = torch.triu(torch.ones(T, T), diagonal=1).bool()
        self._mask_cache[T] = mask.to(device)
    return self._mask_cache[T]
```
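On PyTorch 2.0+, `F.scaled_dot_product_attention` with `is_causal=True` avoids materializing (and caching) the T×T mask entirely and can dispatch to fused attention kernels. A sketch of the swap; `q`, `k`, `v` are placeholders with the usual `[batch, heads, seq, head_dim]` layout:

```python
import torch.nn.functional as F

# Instead of masked_fill with a cached causal mask:
attn_out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
```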
```bash
watch -n 1 nvidia-smi   # Monitor in real-time
```

```python
with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ],
    with_stack=True,
) as prof:
    model(batch)

print(prof.key_averages().table(sort_by="cuda_time_total"))
```
```bash
# Whole-script profiling (runs the CPU and CUDA autograd profilers in one pass)
python -m torch.utils.bottleneck script.py
```

| Symptom | Fix |
|---|---|
| GPU training slower than expected | Use `QuantileDMatrix` for training, set `device='cuda:0'`, avoid stray `.cpu()` transfers, and call `torch.cuda.synchronize()` before timing |
| GPU not visible or throttled | Check `nvidia-smi`; power/persistence settings may require `sudo nvidia-smi` |
| `RuntimeError: XGBoost not compiled with CUDA support` | Reinstall the prebuilt wheel (`uv pip install xgboost`) or build from source with `-DUSE_CUDA=ON` |
| CuPy `ImportError` | Match the wheel to the toolkit: check `nvcc --version`, then install `cupy-cuda12x` |
| cuDF will not install from PyPI | Add `--extra-index-url=https://pypi.nvidia.com` and install `cudf-cu12` |
| `torch.compile` errors out | Fall back to the eager model (`model = model`) |