Loading...
Loading...
Validate and use CUDA graph capture in Megatron Bridge, including local full-iteration graphs and Transformer Engine scoped graphs for attention, MLP, and MoE modules.
npx skill4agent add nvidia/skills perf-cuda-graphs

| Mechanism | Scope support |
|---|---|---|
| MCore | |
| TE | |
`attn` `mlp` `attn moe_router moe_preprocess` `local` `full_iteration` `local`

cfg.model.cuda_graph_impl = "local"
cfg.model.cuda_graph_scope = ["full_iteration"]
cfg.model.cuda_graph_warmup_steps = 3
cfg.model.use_te_rng_tracker = True
cfg.rng.te_rng_tracker = True
cfg.rerun_state_machine.check_for_nan_in_loss = False
cfg.ddp.check_for_nan_in_grad = False

cfg.model.cuda_graph_impl = "transformer_engine"
cfg.model.cuda_graph_scope = ["attn"] # or ["attn", "mlp"]
cfg.model.cuda_graph_warmup_steps = 3
cfg.model.use_te_rng_tracker = True
cfg.rng.te_rng_tracker = True

cfg.model.cuda_graph_impl = "transformer_engine"
cfg.model.cuda_graph_scope = ["attn", "moe_router", "moe_preprocess"]
cfg.model.cuda_graph_warmup_steps = 3
cfg.model.use_te_rng_tracker = True
cfg.rng.te_rng_tracker = True

uv run python scripts/performance/run_script.py \
-m qwen \
-mr qwen3_30b_a3b \
--task pretrain \
-g h100 \
-c bf16 \
-ng 16 \
--cuda_graph_impl transformer_engine \
--cuda_graph_scope attn,moe_router,moe_preprocess \
  ...

`scripts/performance/argument_parser.py` `VALID_CUDA_GRAPH_IMPLS` `["none", "local", "transformer_engine"]` `VALID_CUDA_GRAPH_SCOPES` `["full_iteration", "attn", "mlp", "moe", "moe_router", "moe_preprocess", "mamba"]` `--cuda_graph_scope` `model.use_te_rng_tracker` `rng.te_rng_tracker` `--cuda_graph_impl` `none` `use_te_rng_tracker = True` `gpt_provider.py` `full_iteration` `cuda_graph_impl = "local"` `full_iteration` `check_for_nan_in_loss = False` `moe` `moe_router` `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True` `NCCL_GRAPH_REGISTER=0` `moe_preprocess` `moe_router`

# CUDA graph scope validation: check_for_nan_in_loss must be disabled with full_iteration graph
if self.model.cuda_graph_impl == "local" and CudaGraphScope.full_iteration in self.model.cuda_graph_scope:
    assert not self.rerun_state_machine.check_for_nan_in_loss, (
        "check_for_nan_in_loss must be disabled when using full_iteration CUDA graph. "
        "Set rerun_state_machine.check_for_nan_in_loss=False."
    )
if self.model.cuda_graph_impl == "none":
    self.model.cuda_graph_scope = []

if self.cuda_graph_impl != "none":
    assert getattr(self, "use_te_rng_tracker", False), (
        "Transformer engine's RNG tracker is required for cudagraphs, it can be "
        "enabled with use_te_rng_tracker=True'."
    )

# Capture CUDA Graphs.
cuda_graph_helper = None
if model_config.cuda_graph_impl == "transformer_engine":
    cuda_graph_helper = TECudaGraphHelper(...)
# ...
if config.model.cuda_graph_impl == "local" and CudaGraphScope.full_iteration in config.model.cuda_graph_scope:
    forward_backward_func = FullCudaGraphWrapper(
        forward_backward_func, cuda_graph_warmup_steps=config.model.cuda_graph_warmup_steps
    )

# Capture CUDA Graphs after warmup.
if (
model_config.cuda_graph_impl == "transformer_engine"
and cuda_graph_helper is not None
and not cuda_graph_helper.graphs_created()
and global_state.train_state.step - start_iteration == model_config.cuda_graph_warmup_steps
):
if model_config.cuda_graph_warmup_steps > 0 and should_toggle_forward_pre_hook:
disable_forward_pre_hook(model, param_sync=False)
cuda_graph_helper.create_cudagraphs()
if model_config.cuda_graph_warmup_steps > 0 and should_toggle_forward_pre_hook:
enable_forward_pre_hook(model)
cuda_graph_helper.cuda_graph_set_manual_hooks()

_set_random_seed(
    rng_config.seed,
    rng_config.data_parallel_random_init,
    rng_config.te_rng_tracker,
    rng_config.inference_rng_tracker,
    use_cudagraphable_rng=(model_config.cuda_graph_impl != "none"),
    pg_collection=pg_collection,
)

cuda_graph_scope = getattr(model_cfg, "cuda_graph_scope", []) or []
# ... scope parsing ...
if wgrad_in_graph_scope:
assert is_te_min_version("2.12.0"), ...
assert model_cfg.gradient_accumulation_fusion, ...
if attn_scope_enabled:
assert not model_cfg.add_bias_linear and not model_cfg.add_qkv_bias, ...

def _set_cuda_graph_overrides(
    recipe, cuda_graph_impl=None, cuda_graph_scope=None
):
    # Sets impl, scope, and auto-enables te_rng_tracker

def _delete_cuda_graphs(cuda_graph_helper):
    # Deletes FullCudaGraphWrapper and TE graph objects to free NCCL buffers

`CudaGraphManager` `megatron/core/transformer/cuda_graphs.py`
`TECudaGraphHelper` `megatron/core/transformer/cuda_graphs.py`
`FullCudaGraphWrapper` `megatron/core/full_cuda_graph.py`
`CudaGraphScope` `megatron/core/transformer/enums.py`
`scripts/performance/configs/deepseek/deepseek_workload_base_configs.py`
`scripts/performance/configs/qwen/qwen3_workload_base_configs.py`
`scripts/performance/configs/gpt_oss/gpt_oss_workload_base_configs.py`

| File | Coverage |
|---|---|
| |
| |
| TE autocast with CUDA graphs |
| End-to-end local and TE graph smoke tests |
| TE + CUDA graph recipe config |
| TE + CUDA graph recipe config |
| VLM CUDA graph settings |
`cuda_graph_impl` `use_te_rng_tracker=True` `rng.te_rng_tracker=True` `full_iteration` `moe` `moe_router` `moe_router` `moe_preprocess` `PP > 1` `delay_wgrad_compute=True` `cuda_graph_scope` `gradient_accumulation_fusion=True` `_delete_cuda_graphs()` `NCCL_GRAPH_REGISTER=0` `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True` `CudaGraphManager` `TECudaGraphHelper` `transformer_config.py:1907` `moe_router` `cuda_graph_impl = "transformer_engine"` `transformer_config.py:1977` `full_iteration` `recompute_granularity="full"` `recompute_num_layers` `attn` `mlp` `moe_router` `AssertionError: full recompute is only supported with full iteration CUDA graph.` `LLAMA3_70B_SFT_CONFIG_H100_FP8_CS_V1` `cuda_graph_impl= "transformer_engine"` `cuda_graph_scope="mlp"` `recompute_granularity="selective"` `recompute_modules` `local` `full_iteration` `transformer_config.py:2001-2005` `attn,moe_router,moe_preprocess` `486.9 s` `42.00 s` `41.36 s`

uv run python -m pytest \
  tests/unit_tests/training/test_config.py -k "cuda_graph" \
  tests/unit_tests/training/test_comm_overlap.py -k "cuda_graph" \
  tests/unit_tests/models/test_gpt_full_te_layer_autocast_spec.py -k "cuda_graph" -q

uv run python -m pytest \
  tests/functional_tests/recipes/test_llama_recipes_pretrain_cuda_graphs.py -q

`local` `transformer_engine`