@@ -11,7 +11,10 @@ set -e
1111# Download Megatron-LM if needed
1212if [ ! -d "${MCORE_PATH}" ]; then
1313 pushd $( dirname ${MCORE_PATH} )
14- git clone -b core_v0.16.1 https://github.com/NVIDIA/Megatron-LM.git Megatron-LM
14+ git clone https://github.com/NVIDIA/Megatron-LM.git Megatron-LM
15+ # Megatron-LM / Megatron-FSDP commit for main branch on Apr. 7, 2026.
16+ # Necessary to support wgrad accumulate fusion and Megatron-FSDP NCCL UBR.
17+ pushd Megatron-LM && git checkout 8cbc45b6e039f300c53eb09579fc973d703455cd && popd
1518 popd
1619fi
1720
@@ -34,17 +37,17 @@ export NVTE_BWD_LAYERNORM_SM_MARGIN=0
3437export NVTE_BIAS_GELU_NVFUSION=0
3538export NVTE_BIAS_DROPOUT_FUSION=0
3639
40+ # V1 offloading has bugs that are exposed by Megatron-FSDP.
41+ # This test will focus on validating the new offloading code.
42+ # Un-set the Megatron-LM default of V1.
43+ export NVTE_CPU_OFFLOAD_V1=0
44+
3745# Megatron-LM command to run Megatron-FSDP.
38- # TODO(@cspades): Megatron-Core 0.16.1 doesn't have the NCCL UBR / double-buffer
39- # fix for wgrad accumulate fusion yet. Next version bump of Megatron-Core, add:
40- # --use-nccl-ub
41- # --fsdp-double-buffer
42- # --fsdp-manual-registration
4346python3 \
4447-m torch.distributed.launch \
4548--use_env \
4649--nnodes=1 \
47- --nproc_per_node=4 \
50+ --nproc_per_node=$( nvidia-smi -L | wc -l ) \
4851${MCORE_PATH}/pretrain_gpt.py \
4952--tensor-model-parallel-size 1 \
5053--pipeline-model-parallel-size 1 \
@@ -70,9 +73,12 @@ ${MCORE_PATH}/pretrain_gpt.py \
7073--use-precision-aware-optimizer \
7174--num-distributed-optimizer-instances 2 \
7275--outer-dp-sharding-strategy optim \
76+ --use-nccl-ub \
77+ --fsdp-double-buffer \
78+ --fsdp-manual-registration \
7379--fp8-format hybrid \
7480--fp8-param-gather \
75- --fp8-recipe tensorwise \
81+ --fp8-recipe mxfp8 \
7682--cpu-offloading-num-layers 1 \
7783--overlap-grad-reduce \
7884--overlap-param-gather \
0 commit comments