Checklist / 检查清单
Bug Description / Bug 描述
使用 `py-spy dump --pid <PID>` 查看对应进程的 call stack，如下所示（0 卡）：Thread 59633 (active): "MainThread"
call (torch/_ops.py:1209)
all_reduce (vllm/distributed/device_communicators/symm_mem.py:148)
all_reduce (vllm/distributed/device_communicators/cuda_communicator.py:201)
_all_reduce_out_place (vllm/distributed/parallel_state.py:514)
all_reduce (vllm/distributed/parallel_state.py:132)
call (torch/_ops.py:819)
call (cnm2xsczclisoazvzg2o3rmqmp6zyjyllexblt7qithbgckrvifh.py:1317)
run (torch/_inductor/utils.py:3220)
call (torch/_inductor/output_code.py:638)
wrapper (torch/_functorch/_aot_autograd/runtime_wrappers.py:531)
call_func_at_runtime_with_args (torch/_functorch/_aot_autograd/utils.py:134)
runtime_wrapper (torch/_functorch/_aot_autograd/runtime_wrappers.py:357)
call (torch/_functorch/_aot_autograd/runtime_wrappers.py:1962)
forward (torch/_functorch/aot_autograd.py:1148)
_fn (torch/_dynamo/eval_frame.py:1181)
call (torch/_inductor/standalone_compile.py:122)
call (vllm/compilation/piecewise_backend.py:343)
call (vllm/compilation/cuda_graph.py:223)
forward (<eval_with_key>.110:590)
_call_impl (torch/nn/modules/module.py:1787)
_wrapped_call_impl (torch/nn/modules/module.py:1776)
call (torch/fx/graph_module.py:442)
call_wrapped (torch/fx/graph_module.py:936)
call (vllm/compilation/caching.py:198)
forward (vllm/model_executor/models/qwen3_next.py:1132)
call (torch/_dynamo/aot_compile.py:124)
call (vllm/compilation/decorators.py:402)
forward (vllm/model_executor/models/qwen3_5.py:738)
_call_impl (torch/nn/modules/module.py:1787)
_wrapped_call_impl (torch/nn/modules/module.py:1776)
call (vllm/compilation/cuda_graph.py:223)
_model_forward (vllm/v1/worker/gpu_model_runner.py:3152)
execute_model (vllm/v1/worker/gpu_model_runner.py:3639)
decorate_context (torch/utils/_contextlib.py:124)
execute_model (vllm/v1/worker/gpu_worker.py:728)
decorate_context (torch/utils/_contextlib.py:124)
execute_model (vllm/v1/worker/worker_base.py:365)
run_method (vllm/v1/serial_utils.py:459)
collective_rpc (vllm/v1/executor/uniproc_executor.py:80)
execute_model (vllm/v1/executor/uniproc_executor.py:101)
step_with_batch_queue (vllm/v1/engine/core.py:449)
get_output (vllm/v1/engine/core_client.py:285)
step (vllm/v1/engine/llm_engine.py:302)
infer (swift/infer_engine/vllm_engine.py:754)
infer (swift/infer_engine/grpo_vllm_engine.py:44)
_colocate_rollout (swift/megatron/trainers/rollout_mixin.py:457)
_rollout (swift/megatron/trainers/grpo_trainer.py:499)
_generate_completions (swift/megatron/trainers/grpo_trainer.py:481)
wrapper (swift/rlhf_trainers/utils.py:607)
_generate_and_score_completions (swift/megatron/trainers/grpo_trainer.py:379)
_replace_data_iterator (swift/megatron/trainers/grpo_trainer.py:244)
train_step (swift/megatron/trainers/base.py:811)
train (swift/megatron/trainers/base.py:596)
train (swift/megatron/trainers/grpo_trainer.py:62)
run (swift/megatron/pipelines/train/sft.py:68)
main (swift/pipelines/base.py:52)
megatron_rlhf_main (swift/megatron/pipelines/train/rlhf.py:73)
(swift/cli/_megatron/rlhf.py:7)
Thread 60196 (idle): "Thread-1"
wait (threading.py:331)
wait (threading.py:629)
run (tqdm/_monitor.py:60)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 60310 (idle): "wandb-AsyncioManager-main"
select (selectors.py:468)
_run_once (asyncio/base_events.py:1898)
run_forever (asyncio/base_events.py:608)
run_until_complete (asyncio/base_events.py:641)
run (asyncio/runners.py:118)
run (asyncio/runners.py:190)
run (wandb/sdk/lib/asyncio_compat.py:76)
_main (wandb/sdk/lib/asyncio_manager.py:234)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 60329 (idle): "Thread-8"
wait (threading.py:331)
wait (threading.py:629)
run (tqdm/_monitor.py:60)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 60558 (idle)
Thread 60559 (idle)
Thread 60560 (idle)
Thread 60561 (idle)
Thread 60562 (idle)
Thread 60563 (idle)
Thread 60564 (idle)
Thread 60565 (idle)
Thread 61015 (idle): "Thread-9 (_report_usage_worker)"
_report_continuous_usage (vllm/usage/usage_lib.py:251)
_report_usage_worker (vllm/usage/usage_lib.py:174)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61125 (idle): "Thread-10 (_pin_memory_loop)"
select (selectors.py:415)
wait (multiprocessing/connection.py:948)
_poll (multiprocessing/connection.py:440)
poll (multiprocessing/connection.py:257)
get (multiprocessing/queues.py:113)
do_one_step (torch/utils/data/_utils/pin_memory.py:28)
_pin_memory_loop (torch/utils/data/_utils/pin_memory.py:52)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61126 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61127 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61128 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61129 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61130 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61131 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61132 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61133 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61590 (idle): "WorkerAsyncOutput_0"
_worker (concurrent/futures/thread.py:81)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 63189 (idle): "Thread-11 (_append_worker)"
wait (threading.py:327)
get (queue.py:171)
_append_worker (swift/utils/io_utils.py:58)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
How to Reproduce / 如何复现
以下为训练脚本：

export IMAGE_MAX_TOKEN_NUM=1600
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
NPROC_PER_NODE=8
SWANLAB_API_KEY=&lt;REDACTED&gt;  # NOTE(review): a real API key was posted here — it has been redacted; please revoke/rotate the leaked key
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True'
megatron rlhf
--rlhf_type grpo
--model /opt/wfs2139/model/Qwen3.5-35B-A3B
--output_dir XX
--save_safetensors
--loss_type grpo
--expert_model_parallel_size 8
--pipeline_model_parallel_size 1
--dataset xx
--external_plugins xx
--reward_funcs remote
--save_steps 50
--num_train_epochs 1
--generation_batch_size 512
--global_batch_size 512
--micro_batch_size 4
--moe_permute_fusion true
--num_generations 8
--reward_funcs remote
--use_vllm true
--enable_thinking false
--vllm_mode colocate
--vllm_gpu_memory_utilization 0.25
--vllm_tensor_parallel_size 2
--vllm_max_model_len 8192
--max_completion_length 4096
--epsilon 3e-4
--epsilon_high 4e-4
--lr 3e-6
--min_lr 1e-8
--bf16 true
--beta 0.00
--importance_sampling_level sequence
--dynamic_sample false
--overlong_filter true
--sleep_level 2
--offload_model false
--offload_optimizer true
--optimizer_cpu_offload false
--use_precision_aware_optimizer
--logging_steps 1
--recompute_granularity full
--recompute_method uniform
--recompute_num_layers 1
--finetune
--dataloader_num_workers 8
--dataset_num_proc 8
--no_save_optim
--no_save_rng
--attention_backend flash
--temperature 1.0
--padding_free false
--sequence_parallel false
--log_completions true --report_to swanlab
Additional Information / 补充信息
No response
Checklist / 检查清单
Bug Description / Bug 描述
使用 `py-spy dump --pid <PID>` 查看对应进程的 call stack，如下所示（0 卡）：Thread 59633 (active): "MainThread"
call (torch/_ops.py:1209)
all_reduce (vllm/distributed/device_communicators/symm_mem.py:148)
all_reduce (vllm/distributed/device_communicators/cuda_communicator.py:201)
_all_reduce_out_place (vllm/distributed/parallel_state.py:514)
all_reduce (vllm/distributed/parallel_state.py:132)
call (torch/_ops.py:819)
call (cnm2xsczclisoazvzg2o3rmqmp6zyjyllexblt7qithbgckrvifh.py:1317)
run (torch/_inductor/utils.py:3220)
call (torch/_inductor/output_code.py:638)
wrapper (torch/_functorch/_aot_autograd/runtime_wrappers.py:531)
call_func_at_runtime_with_args (torch/_functorch/_aot_autograd/utils.py:134)
runtime_wrapper (torch/_functorch/_aot_autograd/runtime_wrappers.py:357)
call (torch/_functorch/_aot_autograd/runtime_wrappers.py:1962)
forward (torch/_functorch/aot_autograd.py:1148)
_fn (torch/_dynamo/eval_frame.py:1181)
call (torch/_inductor/standalone_compile.py:122)
call (vllm/compilation/piecewise_backend.py:343)
call (vllm/compilation/cuda_graph.py:223)
forward (<eval_with_key>.110:590)
_call_impl (torch/nn/modules/module.py:1787)
_wrapped_call_impl (torch/nn/modules/module.py:1776)
call (torch/fx/graph_module.py:442)
call_wrapped (torch/fx/graph_module.py:936)
call (vllm/compilation/caching.py:198)
forward (vllm/model_executor/models/qwen3_next.py:1132)
call (torch/_dynamo/aot_compile.py:124)
call (vllm/compilation/decorators.py:402)
forward (vllm/model_executor/models/qwen3_5.py:738)
_call_impl (torch/nn/modules/module.py:1787)
_wrapped_call_impl (torch/nn/modules/module.py:1776)
call (vllm/compilation/cuda_graph.py:223)
_model_forward (vllm/v1/worker/gpu_model_runner.py:3152)
execute_model (vllm/v1/worker/gpu_model_runner.py:3639)
decorate_context (torch/utils/_contextlib.py:124)
execute_model (vllm/v1/worker/gpu_worker.py:728)
decorate_context (torch/utils/_contextlib.py:124)
execute_model (vllm/v1/worker/worker_base.py:365)
run_method (vllm/v1/serial_utils.py:459)
collective_rpc (vllm/v1/executor/uniproc_executor.py:80)
execute_model (vllm/v1/executor/uniproc_executor.py:101)
step_with_batch_queue (vllm/v1/engine/core.py:449)
get_output (vllm/v1/engine/core_client.py:285)
step (vllm/v1/engine/llm_engine.py:302)
infer (swift/infer_engine/vllm_engine.py:754)
infer (swift/infer_engine/grpo_vllm_engine.py:44)
_colocate_rollout (swift/megatron/trainers/rollout_mixin.py:457)
_rollout (swift/megatron/trainers/grpo_trainer.py:499)
_generate_completions (swift/megatron/trainers/grpo_trainer.py:481)
wrapper (swift/rlhf_trainers/utils.py:607)
_generate_and_score_completions (swift/megatron/trainers/grpo_trainer.py:379)
_replace_data_iterator (swift/megatron/trainers/grpo_trainer.py:244)
train_step (swift/megatron/trainers/base.py:811)
train (swift/megatron/trainers/base.py:596)
train (swift/megatron/trainers/grpo_trainer.py:62)
run (swift/megatron/pipelines/train/sft.py:68)
main (swift/pipelines/base.py:52)
megatron_rlhf_main (swift/megatron/pipelines/train/rlhf.py:73)
(swift/cli/_megatron/rlhf.py:7)
Thread 60196 (idle): "Thread-1"
wait (threading.py:331)
wait (threading.py:629)
run (tqdm/_monitor.py:60)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 60310 (idle): "wandb-AsyncioManager-main"
select (selectors.py:468)
_run_once (asyncio/base_events.py:1898)
run_forever (asyncio/base_events.py:608)
run_until_complete (asyncio/base_events.py:641)
run (asyncio/runners.py:118)
run (asyncio/runners.py:190)
run (wandb/sdk/lib/asyncio_compat.py:76)
_main (wandb/sdk/lib/asyncio_manager.py:234)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 60329 (idle): "Thread-8"
wait (threading.py:331)
wait (threading.py:629)
run (tqdm/_monitor.py:60)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 60558 (idle)
Thread 60559 (idle)
Thread 60560 (idle)
Thread 60561 (idle)
Thread 60562 (idle)
Thread 60563 (idle)
Thread 60564 (idle)
Thread 60565 (idle)
Thread 61015 (idle): "Thread-9 (_report_usage_worker)"
_report_continuous_usage (vllm/usage/usage_lib.py:251)
_report_usage_worker (vllm/usage/usage_lib.py:174)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61125 (idle): "Thread-10 (_pin_memory_loop)"
select (selectors.py:415)
wait (multiprocessing/connection.py:948)
_poll (multiprocessing/connection.py:440)
poll (multiprocessing/connection.py:257)
get (multiprocessing/queues.py:113)
do_one_step (torch/utils/data/_utils/pin_memory.py:28)
_pin_memory_loop (torch/utils/data/_utils/pin_memory.py:52)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61126 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61127 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61128 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61129 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61130 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61131 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61132 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61133 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61590 (idle): "WorkerAsyncOutput_0"
_worker (concurrent/futures/thread.py:81)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 63189 (idle): "Thread-11 (_append_worker)"
wait (threading.py:327)
get (queue.py:171)
_append_worker (swift/utils/io_utils.py:58)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
How to Reproduce / 如何复现
以下为训练脚本：

export IMAGE_MAX_TOKEN_NUM=1600
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
NPROC_PER_NODE=8
SWANLAB_API_KEY=&lt;REDACTED&gt;  # NOTE(review): a real API key was posted here — it has been redacted; please revoke/rotate the leaked key
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True'
megatron rlhf
--rlhf_type grpo
--model /opt/wfs2139/model/Qwen3.5-35B-A3B
--output_dir XX
--save_safetensors
--loss_type grpo
--expert_model_parallel_size 8
--pipeline_model_parallel_size 1
--dataset xx
--external_plugins xx
--reward_funcs remote
--save_steps 50
--num_train_epochs 1
--generation_batch_size 512
--global_batch_size 512
--micro_batch_size 4
--moe_permute_fusion true
--num_generations 8
--reward_funcs remote
--use_vllm true
--enable_thinking false
--vllm_mode colocate
--vllm_gpu_memory_utilization 0.25
--vllm_tensor_parallel_size 2
--vllm_max_model_len 8192
--max_completion_length 4096
--epsilon 3e-4
--epsilon_high 4e-4
--lr 3e-6
--min_lr 1e-8
--bf16 true
--beta 0.00
--importance_sampling_level sequence
--dynamic_sample false
--overlong_filter true
--sleep_level 2
--offload_model false
--offload_optimizer true
--optimizer_cpu_offload false
--use_precision_aware_optimizer
--logging_steps 1
--recompute_granularity full
--recompute_method uniform
--recompute_num_layers 1
--finetune
--dataloader_num_workers 8
--dataset_num_proc 8
--no_save_optim
--no_save_rng
--attention_backend flash
--temperature 1.0
--padding_free false
--sequence_parallel false
--log_completions true --report_to swanlab
Additional Information / 补充信息
No response