Loading...
Loading...
Resiliency features in Megatron Bridge, including fault tolerance, straggler detection, in-process restart, preemption, and the re-run state machine.
npx skill4agent add nvidia/skills resiliency

from megatron.bridge.recipes.run_plugins import FaultTolerancePlugin
import nemo_run as run
task = run.Script(...)
run_plugins = [
FaultTolerancePlugin(
enable_ft_package=True,
calc_ft_timeouts=True,
num_in_job_restarts=3,
num_job_retries_on_failure=2,
initial_rank_heartbeat_timeout=1800,
rank_heartbeat_timeout=300,
)
]
run.run(task, plugins=run_plugins, executor=executor)

| Plugin parameter | Default | Description |
|---|---|---|
| `num_in_job_restarts` | 3 | Max restarts within same job |
| `num_job_retries_on_failure` | 2 | Max new job launches on failure |
| `initial_rank_heartbeat_timeout` | 1800 | First heartbeat timeout (seconds) |
| `rank_heartbeat_timeout` | 300 | Subsequent heartbeat timeout (seconds) |
from megatron.bridge.training.config import FaultToleranceConfig
cfg.ft = FaultToleranceConfig(
enable_ft_package=True,
calc_ft_timeouts=True,
simulate_fault=False,
simulated_fault_type="random",
)

`ft_launcher`, `torchrun`

export GROUP_RANK=0 # required for non-Slurm
ft_launcher \
--rdzv_backend=c10d --rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
--nnodes=${NUM_NODES} --nproc-per-node=${NUM_GPUS_PER_NODE} \
--ft-rank_section_timeouts=setup:600,step:180,checkpointing:420 \
--ft-rank_out_of_section_timeout=300 \
your_training_script.py

| Config parameter | Default | Description |
|---|---|---|
| `enable_ft_package` | False | Enable fault tolerance |
| `calc_ft_timeouts` | False | Auto-compute optimal timeouts |
| `simulate_fault` | False | Enable fault simulation for testing |
| | | |
| | None | Specific rank to fault (random if None) |
| | 0 | Base delay before simulating fault |
`ft_state.json`, `calc_ft_timeouts=True`

from megatron.bridge.training.config import NVRxStragglerDetectionConfig
cfg.nvrx_straggler = NVRxStragglerDetectionConfig(
enabled=True,
report_time_interval=300.0,
calc_relative_gpu_perf=True,
calc_individual_gpu_perf=True,
num_gpu_perf_scores_to_print=5,
gpu_relative_perf_threshold=0.7,
gpu_individual_perf_threshold=0.7,
stop_if_detected=False,
enable_logging=True,
)

| Parameter | Default | Description |
|---|---|---|
| `enabled` | False | Enable straggler detection |
| `report_time_interval` | 300.0 | Seconds between straggler checks |
| `calc_relative_gpu_perf` | True | Compare ranks against each other |
| `calc_individual_gpu_perf` | True | Track per-rank degradation over time |
| `gpu_relative_perf_threshold` | 0.7 | Threshold for relative performance (0-1) |
| `gpu_individual_perf_threshold` | 0.7 | Threshold for individual performance (0-1) |
| `stop_if_detected` | False | Terminate training on straggler |
| `num_gpu_perf_scores_to_print` | 5 | Number of best/worst scores to print |
| | 1 | Profiling interval for detector |
from megatron.bridge.recipes.run_plugins import PreemptionPlugin
plugins = [
PreemptionPlugin(
preempt_time=60,
enable_exit_handler=True,
enable_exit_handler_for_data_loader=False,
)
]

| Plugin parameter | Default | Description |
|---|---|---|
| `preempt_time` | 60 | Seconds before job limit to send signal |
| `enable_exit_handler` | True | Enable signal handler in training |
| `enable_exit_handler_for_data_loader` | False | Enable for dataloader workers |
import signal
cfg.train.exit_signal_handler = True
cfg.train.exit_signal = signal.SIGTERM
cfg.train.exit_signal_handler_for_dataloader = False

from megatron.bridge.training.config import RerunStateMachineConfig
cfg.rerun_state_machine = RerunStateMachineConfig(
rerun_mode="validate_results",
check_for_nan_in_loss=True,
check_for_spiky_loss=False,
spiky_loss_factor=10.0,
)

| Parameter | Default | Description |
|---|---|---|
| | | |
| `check_for_nan_in_loss` | True | Check for NaN in loss |
| `check_for_spiky_loss` | False | Check for unexpectedly large loss |
| `spiky_loss_factor` | 10.0 | Loss flagged if > factor * max observed (increase for large models) |
from megatron.bridge.training.config import InProcessRestartConfig
cfg.inprocess_restart = InProcessRestartConfig(
enabled=True,
granularity="node",
soft_timeout=60.0,
hard_timeout=90.0,
)

| Parameter | Default | Description |
|---|---|---|
| `enabled` | False | Enable in-process restart |
| | None | Ranks executing workload (rest are warm reserves) |
| | | |
| | None | Max restart attempts (None = unlimited) |
| `soft_timeout` | 60.0 | Detect GIL-released hangs (seconds) |
| `hard_timeout` | 90.0 | Force-terminate hung ranks (seconds) |
| | 30.0 | Heartbeat interval (seconds) |
| | 60.0 | Missing heartbeat timeout (seconds) |
| | 120.0 | Distributed barrier timeout (seconds) |
| | 120.0 | Completion barrier timeout (seconds) |
| | True | Clear CUDA cache during restart |
| | None | Max rank faults before terminating |
| | None | Directory for monitor logs |
export TORCH_CPP_LOG_LEVEL=error
export TORCH_NCCL_RETHROW_CUDA_ERRORS=0
export NCCL_NVLS_ENABLE=0

`hard_timeout`

srun --kill-on-bad-exit=0

cfg.checkpoint.async_save = True
cfg.checkpoint.ckpt_format = "torch_dist"

cfg.checkpoint.non_persistent_local_ckpt_dir = "/local/scratch/ckpt"
cfg.checkpoint.non_persistent_local_ckpt_algo = "fully_parallel"

- `src/megatron/bridge/training/config.py` `FaultToleranceConfig`
- `src/megatron/bridge/training/fault_tolerance.py`
- `src/megatron/bridge/recipes/run_plugins.py` `FaultTolerancePlugin`
- `scripts/performance/resiliency_plugins.py`
- `tests/unit_tests/training/test_fault_tolerance.py`
- `examples/training_features/resiliency/fault_tolerance/`

- `src/megatron/bridge/training/config.py` `NVRxStragglerDetectionConfig`
- `src/megatron/bridge/training/nvrx_straggler.py`
- `src/megatron/bridge/training/train.py` `check_nvrx_straggler_detection`
- `tests/unit_tests/training/test_nvrx_straggler.py`
- `tests/functional_tests/training/test_nvrx_straggler.py`
- `examples/training_features/resiliency/straggler_detection/`

- `src/megatron/bridge/training/config.py` `InProcessRestartConfig`
- `src/megatron/bridge/training/inprocess_restart.py`
- `src/megatron/bridge/training/pretrain.py` `maybe_wrap_for_inprocess_restart`
- `tests/unit_tests/training/test_inprocess_restart.py`
- `tests/functional_tests/training/test_inprocess_restart.py`

- `src/megatron/bridge/recipes/run_plugins.py` `PreemptionPlugin`
- `src/megatron/bridge/training/utils/sig_utils.py`
- `tests/unit_tests/recipes/test_run_plugins.py`

- `src/megatron/bridge/training/config.py` `RerunStateMachineConfig`
- `src/megatron/bridge/training/initialize.py` `init_rerun_state`

- `src/megatron/bridge/training/checkpointing.py` `schedule_async_save`
- `src/megatron/bridge/training/checkpointing.py` `LocalCheckpointManager`
- `tests/functional_tests/training/test_local_checkpointing.py`

- `FaultToleranceConfig`
- `ft_launcher`
- `torchrun`
- `GROUP_RANK=0`
- `async_save=True`
- `ckpt_format="torch_dist"`
- `nvrx_straggler`
- `stop_if_detected=True`
- `hard_timeout`
- `check_for_nan_in_loss=True`

./examples/training_features/resiliency/fault_tolerance/run_fault_tolerance.sh
./examples/training_features/resiliency/fault_tolerance/run_fault_tolerance.sh --simulate-fault

`[FaultTolerance]`, `[RankMonitorServer]`

uv run python -m torch.distributed.run --nproc_per_node=2 \
examples/training_features/resiliency/straggler_detection/straggler_detection_example.py

`GPU relative performance`, `GPU individual performance`

`Scheduling async checkpoint save`

pytest tests/functional_tests/training/test_inprocess_restart.py -v