Loading...
Loading...
How to launch distributed Megatron-LM training jobs on a SLURM cluster. Covers a minimal sbatch skeleton, environment-variable setup for torch.distributed.run, CUDA_DEVICE_MAX_CONNECTIONS rules across hardware and parallelism modes, container conventions, monitoring, and per-rank failure diagnosis.
npx skill4agent add nvidia/skills run-on-slurm
uv
uv sync --extra training --extra dev
--extra lts
.venv
run_megatron.slurm
#!/bin/bash
#SBATCH --job-name=megatron
#SBATCH --account=<SLURM_ACCOUNT>
#SBATCH --partition=<SLURM_PARTITION>
#SBATCH --nodes=<NODES>
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=<GPUS_PER_NODE>
#SBATCH --time=<HH:MM:SS>
#SBATCH --output=logs/%x-%j.out
#SBATCH --error=logs/%x-%j.err
#
# Preamble of the Megatron-LM batch script: requests the allocation above,
# then derives the rendezvous settings that torch.distributed.run is given.
# Every <PLACEHOLDER> is a template marker and must be replaced before use.
# NOTE: stdout/stderr are written under logs/, so create that directory
# before submitting.

# Fail fast: abort on errors, on unset variables, and on failures anywhere
# in a pipeline.
set -euo pipefail
cd <MEGATRON_WORKTREE>

# Rendezvous host = first hostname of the allocation.
# The command substitution is assigned on its own line because
# `export VAR=$(cmd)` returns the status of `export`, hiding a failing
# scontrol from `set -e`. The first line is taken with parameter expansion
# rather than `| head -n1`, so an early-exiting reader cannot turn into a
# SIGPIPE failure under `pipefail` on large allocations.
node_hostnames=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
MASTER_ADDR=${node_hostnames%%$'\n'*}
: "${MASTER_ADDR:?scontrol returned no hostnames for SLURM_JOB_NODELIST}"
export MASTER_ADDR

# Honour a MASTER_PORT supplied by the caller; otherwise use 29500.
export MASTER_PORT=${MASTER_PORT:-29500}
export NNODES=${SLURM_NNODES}
export GPUS_PER_NODE=<GPUS_PER_NODE>
# Total number of ranks across the job.
export WORLD_SIZE=$((NNODES * GPUS_PER_NODE))
# Set CUDA_DEVICE_MAX_CONNECTIONS only when your configuration requires it
# (see the section below). Example for pre-Blackwell with TP>1 or CP>1
# (non-FSDP):
# export CUDA_DEVICE_MAX_CONNECTIONS=1
# Launch one task per node. Each task runs torch.distributed.run with
# --nproc-per-node set to GPUS_PER_NODE.
#
# Quoting: the inner script is single-quoted so that ${SLURM_NODEID} and
# ${NODE_RANK} are expanded by the per-node shell. Each '"${VAR}"' sequence
# closes the single quotes, splices in a value expanded by the batch shell
# (NNODES, GPUS_PER_NODE, MASTER_ADDR, MASTER_PORT), and reopens them.
srun --ntasks="${NNODES}" --ntasks-per-node=1 bash -c '
# NODE_RANK comes from SLURM_NODEID with one task per node.
NODE_RANK=${SLURM_NODEID}
uv run python -m torch.distributed.run \
--nnodes='"${NNODES}"' \
--nproc-per-node='"${GPUS_PER_NODE}"' \
--node-rank=${NODE_RANK} \
--master-addr='"${MASTER_ADDR}"' \
--master-port='"${MASTER_PORT}"' \
pretrain_gpt.py \
<MEGATRON_ARGS>
'

# --- Submission snippet: NOT part of the batch script above ---
# Run this from the shell you submit jobs from. It creates logs/ first, then
# captures the job id that `sbatch --parsable` prints.
mkdir -p logs && JOB_ID=$(sbatch --parsable run_megatron.slurm)
echo "Submitted ${JOB_ID}"
cd
torchrun
--nproc-per-node
1111
overlap_moe_expert_parallel_comm
32
.venv
docker/.ngc_version.dev
.ngc_version.lts
srun
--container-image=…
--container-mounts=…
squeue -j "$JOB_ID" -o "%.10i %.8T %.10M %.6D %R"
sacct -j "$JOB_ID" --format=JobID,State,ExitCode,Elapsed
scancel "$JOB_ID"
squeue
WORLD_SIZE = TP × DP × CP × PP
num_attention_heads % TP == 0
uv sync
PYTHONPATH
cd <MEGATRON_WORKTREE>
MASTER_ADDR
uv sync
srun
CUDA_DEVICE_MAX_CONNECTIONS=11
torchrun
uv run python -m torch.distributed.run
torchrun