Skip to content

Commit e521694

Browse files
committed
Edits.
Signed-off-by: Cory Ye <cye@nvidia.com>
1 parent dcd334a commit e521694

File tree

1 file changed

+14
-8
lines changed
  • qa/L1_pytorch_mcore_fsdp_integration

1 file changed

+14
-8
lines changed

qa/L1_pytorch_mcore_fsdp_integration/test.sh

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,10 @@ set -e
1111
# Download Megatron-LM if needed
1212
if [ ! -d "${MCORE_PATH}" ]; then
1313
pushd $(dirname ${MCORE_PATH})
14-
git clone -b core_v0.16.1 https://github.com/NVIDIA/Megatron-LM.git Megatron-LM
14+
git clone https://github.com/NVIDIA/Megatron-LM.git Megatron-LM
15+
# Pin Megatron-LM / Megatron-FSDP to the main-branch commit from Apr. 7, 2026.
16+
# Necessary to support wgrad accumulate fusion and Megatron-FSDP NCCL UBR.
17+
pushd Megatron-LM && git checkout 8cbc45b6e039f300c53eb09579fc973d703455cd && popd
1518
popd
1619
fi
1720

@@ -34,17 +37,17 @@ export NVTE_BWD_LAYERNORM_SM_MARGIN=0
3437
export NVTE_BIAS_GELU_NVFUSION=0
3538
export NVTE_BIAS_DROPOUT_FUSION=0
3639

40+
# V1 offloading has bugs that are exposed by Megatron-FSDP.
41+
# This test will focus on validating the new offloading code.
42+
# Override the Megatron-LM default of V1 by explicitly disabling it.
43+
export NVTE_CPU_OFFLOAD_V1=0
44+
3745
# Megatron-LM command to run Megatron-FSDP.
38-
# TODO(@cspades): Megatron-Core 0.16.1 doesn't have the NCCL UBR / double-buffer
39-
# fix for wgrad accumulate fusion yet. Next version bump of Megatron-Core, add:
40-
# --use-nccl-ub
41-
# --fsdp-double-buffer
42-
# --fsdp-manual-registration
4346
python3 \
4447
-m torch.distributed.launch \
4548
--use_env \
4649
--nnodes=1 \
47-
--nproc_per_node=4 \
50+
--nproc_per_node=$(nvidia-smi -L | wc -l) \
4851
${MCORE_PATH}/pretrain_gpt.py \
4952
--tensor-model-parallel-size 1 \
5053
--pipeline-model-parallel-size 1 \
@@ -70,9 +73,12 @@ ${MCORE_PATH}/pretrain_gpt.py \
7073
--use-precision-aware-optimizer \
7174
--num-distributed-optimizer-instances 2 \
7275
--outer-dp-sharding-strategy optim \
76+
--use-nccl-ub \
77+
--fsdp-double-buffer \
78+
--fsdp-manual-registration \
7379
--fp8-format hybrid \
7480
--fp8-param-gather \
75-
--fp8-recipe tensorwise \
81+
--fp8-recipe mxfp8 \
7682
--cpu-offloading-num-layers 1 \
7783
--overlap-grad-reduce \
7884
--overlap-param-gather \

0 commit comments

Comments
 (0)