Loading...
Loading...
Validate and use MoE expert-parallel communication overlap in Megatron-Bridge, including overlap_moe_expert_parallel_comm, delay_wgrad_compute, and flex dispatcher backends such as DeepEP and HybridEP.
npx skill4agent add nvidia/skills nemo-mbridge-perf-expert-parallel-overlap

`delay_wgrad_compute`

| Dispatcher | Backend | When to use |
|---|---|---|
| `alltoall` | Standard MoE all-to-all | Default, broadest compatibility |
| `flex` | DeepEP or HybridEP | Higher overlap on Ampere/Hopper/Blackwell |
`EP > 1`, `alltoall`, `flex`, `moe_shared_expert_overlap`

cfg.comm_overlap.overlap_moe_expert_parallel_comm = True
cfg.comm_overlap.delay_wgrad_compute = True
cfg.model.moe_shared_expert_overlap = False
cfg.model.expert_model_parallel_size = 8
cfg.model.num_moe_experts = 64
cfg.model.moe_token_dispatcher_type = "alltoall"
cfg.model.bf16 = True
cfg.model.fp16 = False

from megatron.bridge.training.flex_dispatcher_backend import apply_flex_dispatcher_backend
cfg.comm_overlap.overlap_moe_expert_parallel_comm = True
cfg.comm_overlap.delay_wgrad_compute = True
cfg.model.moe_shared_expert_overlap = False
apply_flex_dispatcher_backend(cfg.model, moe_flex_dispatcher_backend="deepep")
# or: apply_flex_dispatcher_backend(cfg.model, moe_flex_dispatcher_backend="hybridep")

`expert_model_parallel_size > 1`, `num_moe_experts > 1`, `moe_token_dispatcher_type`, `"alltoall"`, `"flex"`, `moe_shared_expert_overlap = False`, `>= 2.6.0`, `PP > 1`, `virtual_pipeline_model_parallel_size`, `recompute_granularity != "full"`, `recompute_method = None`, `recompute_num_layers = None`, `mtp_num_layers`, `None`, `1`, `delay_wgrad_compute`, `overlap_moe_expert_parallel_comm`, `delay_wgrad_compute`, `overlap_grad_reduce`, `delay_wgrad_compute`, `gradient_accumulation_fusion`, `attn`, `delay_wgrad_compute`, `gradient_accumulation_fusion = True`

cfg.comm_overlap.overlap_moe_expert_parallel_comm = True
cfg.comm_overlap.delay_wgrad_compute = False
cfg.model.expert_model_parallel_size = 4
cfg.model.num_moe_experts = 64
cfg.model.moe_token_dispatcher_type = "alltoall"
cfg.model.moe_shared_expert_overlap = False
cfg.model.bf16 = True

`EP=16`, `alltoall`, `moe_permute_fusion=false`

| Case | Steady mean | Relative |
|---|---|---|
| no EP overlap | 41.25s | 1.000x |
| EP overlap | 31.31s | 1.317x |
| EP overlap plus | 31.20s | 1.322x |
uv run python scripts/performance/run_script.py \
-m qwen \
-mr qwen3_30b_a3b \
--task pretrain \
-g h100 \
-c bf16 \
-ng 16 \
-gn 8 \
--max_steps 8 \
--config_variant v1 \
--cuda_graph_impl none \
--moe_flex_dispatcher_backend None \
--moe_a2a_overlap false \
--tokenizer_type NullTokenizer \
comm_overlap.overlap_moe_expert_parallel_comm=true \
comm_overlap.delay_wgrad_compute=false \
model.moe_shared_expert_overlap=false

`--moe_a2a_overlap true`, `overlap_moe_expert_parallel_comm`, `delay_wgrad_compute`

uv run python -m pytest \
tests/unit_tests/training/test_comm_overlap.py -k "moe" \
tests/unit_tests/training/test_deepep.py -q

uv run python -m pytest \
tests/unit_tests/training/test_comm_overlap.py \
tests/unit_tests/training/test_deepep.py -q

`CommOverlapConfig`, `overlap_moe_expert_parallel_comm`, `True`, `moe_token_dispatcher_type = "flex"`

if self.user_comm_overlap_cfg.overlap_moe_expert_parallel_comm is True:
    assert model_cfg.expert_model_parallel_size > 1, ...
    assert model_cfg.num_moe_experts > 1, ...
    assert model_cfg.moe_token_dispatcher_type in ["alltoall", "flex"], ...
    assert model_cfg.bf16 or model_cfg.fp16, ...
    assert is_torch_min_version("2.6.0"), ...
    # ... PP + VPP check, recompute checks, shared_expert_overlap check ...

if self.user_comm_overlap_cfg.delay_wgrad_compute is True:
    # TE version checks for overlap_grad_reduce and gradient_accumulation_fusion
    # CUDA graph scope validations for delayed wgrad
    assert overlap_moe_expert_parallel_comm, ...

def apply_flex_dispatcher_backend(...):
    # GPU architecture check for DeepEP / HybridEP
    model_config.moe_token_dispatcher_type = "flex"
    model_config.moe_flex_dispatcher_backend = moe_flex_dispatcher_backend
    model_config.moe_shared_expert_overlap = False

def _set_moe_a2a_overlap_overrides(recipe, moe_a2a_overlap=False):
    if moe_a2a_overlap:
        recipe.comm_overlap.overlap_moe_expert_parallel_comm = True
        recipe.comm_overlap.delay_wgrad_compute = True
        recipe.model.moe_shared_expert_overlap = False

| File | Coverage |
|---|---|
| `tests/unit_tests/training/test_comm_overlap.py` | EP overlap validation, delayed wgrad, CUDA graph + wgrad interaction |
| `tests/unit_tests/training/test_deepep.py` | DeepEP/HybridEP helper activation and GPU gating |
| Symptom | Likely Cause | How To Confirm | Fix |
|---|---|---|---|
| assert | EP not configured | Check | Set EP > 1 |
| assert | Wrong dispatcher | Check dispatcher type | Use |
| assert on BF16/FP16 | Wrong precision | Check | Set |
| hang during training | PyTorch < 2.6 | Check PyTorch version | Upgrade to >= 2.6.0 |
| assert | PP > 1 without VPP | Check PP and VPP config | Set VPP when PP > 1 |
| assert | Full recompute enabled | Check recompute settings | Disable full recompute |
| assert | delayed wgrad without EP overlap | Check | Enable EP overlap first |
| assert | CUDA graph + delayed wgrad | Check graph scope + wgrad settings | Enable |
| assert on attention bias | CUDA graph attn + delayed wgrad + bias | Check | Disable attention bias |
| no throughput gain from flex dispatcher | | Check | Call |
| DeepEP/HybridEP silently skipped | Unsupported GPU | Check warning logs | Run on Ampere/Hopper/Blackwell |
`moe_flex_dispatcher_backend`, `apply_flex_dispatcher_backend(...)`