Loading...
Loading...
Operational guide for enabling TP, DP, and PP communication overlap in Megatron-Bridge, including config knobs, code anchors, pitfalls, and verification.
npx skill4agent add nvidia/skills perf-tp-dp-comm-overlap

from megatron.bridge.training.comm_overlap import CommOverlapConfig
cfg.model.tensor_model_parallel_size = 4
cfg.model.sequence_parallel = True
cfg.model.pipeline_model_parallel_size = 4
cfg.model.virtual_pipeline_model_parallel_size = 2
cfg.comm_overlap = CommOverlapConfig(
    tp_comm_overlap=True,
)
cfg.ddp.use_distributed_optimizer = True
cfg.ddp.overlap_grad_reduce = True
cfg.ddp.overlap_param_gather = True

from megatron.bridge.training.comm_overlap import userbuffers_bf16_h100_h12288_tp4_mbs1_seqlen2048
cfg.comm_overlap.tp_comm_overlap_cfg = userbuffers_bf16_h100_h12288_tp4_mbs1_seqlen2048

cfg.mixed_precision.grad_reduce_in_fp32 = False
cfg.mixed_precision.fp8_param_gather = False

if self.user_comm_overlap_cfg.tp_comm_overlap is True:
    if model_cfg.tensor_model_parallel_size < 2:
        ...
    elif not model_cfg.sequence_parallel:
        ...
    elif not HAVE_TE:
        ...

if model_cfg.pipeline_model_parallel_size > 1:
    if vp_size > 1:
        comm_overlap_cfg.overlap_p2p_comm = True
        comm_overlap_cfg.batch_p2p_comm = False
    else:
        comm_overlap_cfg.overlap_p2p_comm = False
        comm_overlap_cfg.batch_p2p_comm = True

if self.data_parallel_size > 1:
    comm_overlap_cfg.bucket_size = 128 * 1024 * 1024
    comm_overlap_cfg.overlap_grad_reduce = True
    comm_overlap_cfg.overlap_param_gather = True

executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = str(cuda_device_max_connections)
...
executor.env_vars["NVTE_FWD_LAYERNORM_SM_MARGIN"] = str(self.layernorm_sm_margin)
executor.env_vars["NVTE_BWD_LAYERNORM_SM_MARGIN"] = str(self.layernorm_sm_margin)

sequence_parallel=False
overlap_p2p_comm=True
PP > 1
VPP > 1
bucket_size
grad_reduce_in_fp32
fp8_param_gather
CUDA_DEVICE_MAX_CONNECTIONS
CommOverlapConfig
uv run python -m pytest tests/unit_tests/training/test_comm_overlap.py -q
nemo_run
uv run python -m pytest tests/unit_tests/recipes/test_run_plugins.py -q
26 passed