Checklist / 检查清单
Bug Description / Bug 描述
使用 `py-spy dump --pid <PID>` 查看对应进程的 call stack，如下所示（0 卡）：Thread 59633 (active): "MainThread"
call (torch/_ops.py:1209)
all_reduce (vllm/distributed/device_communicators/symm_mem.py:148)
all_reduce (vllm/distributed/device_communicators/cuda_communicator.py:201)
_all_reduce_out_place (vllm/distributed/parallel_state.py:514)
all_reduce (vllm/distributed/parallel_state.py:132)
call (torch/_ops.py:819)
call (cnm2xsczclisoazvzg2o3rmqmp6zyjyllexblt7qithbgckrvifh.py:1317)
run (torch/_inductor/utils.py:3220)
call (torch/_inductor/output_code.py:638)
wrapper (torch/_functorch/_aot_autograd/runtime_wrappers.py:531)
call_func_at_runtime_with_args (torch/_functorch/_aot_autograd/utils.py:134)
runtime_wrapper (torch/_functorch/_aot_autograd/runtime_wrappers.py:357)
call (torch/_functorch/_aot_autograd/runtime_wrappers.py:1962)
forward (torch/_functorch/aot_autograd.py:1148)
_fn (torch/_dynamo/eval_frame.py:1181)
call (torch/_inductor/standalone_compile.py:122)
call (vllm/compilation/piecewise_backend.py:343)
call (vllm/compilation/cuda_graph.py:223)
forward (<eval_with_key>.110:590)
_call_impl (torch/nn/modules/module.py:1787)
_wrapped_call_impl (torch/nn/modules/module.py:1776)
call (torch/fx/graph_module.py:442)
call_wrapped (torch/fx/graph_module.py:936)
call (vllm/compilation/caching.py:198)
forward (vllm/model_executor/models/qwen3_next.py:1132)
call (torch/_dynamo/aot_compile.py:124)
call (vllm/compilation/decorators.py:402)
forward (vllm/model_executor/models/qwen3_5.py:738)
_call_impl (torch/nn/modules/module.py:1787)
_wrapped_call_impl (torch/nn/modules/module.py:1776)
call (vllm/compilation/cuda_graph.py:223)
_model_forward (vllm/v1/worker/gpu_model_runner.py:3152)
execute_model (vllm/v1/worker/gpu_model_runner.py:3639)
decorate_context (torch/utils/_contextlib.py:124)
execute_model (vllm/v1/worker/gpu_worker.py:728)
decorate_context (torch/utils/_contextlib.py:124)
execute_model (vllm/v1/worker/worker_base.py:365)
run_method (vllm/v1/serial_utils.py:459)
collective_rpc (vllm/v1/executor/uniproc_executor.py:80)
execute_model (vllm/v1/executor/uniproc_executor.py:101)
step_with_batch_queue (vllm/v1/engine/core.py:449)
get_output (vllm/v1/engine/core_client.py:285)
step (vllm/v1/engine/llm_engine.py:302)
infer (swift/infer_engine/vllm_engine.py:754)
infer (swift/infer_engine/grpo_vllm_engine.py:44)
_colocate_rollout (swift/megatron/trainers/rollout_mixin.py:457)
_rollout (swift/megatron/trainers/grpo_trainer.py:499)
_generate_completions (swift/megatron/trainers/grpo_trainer.py:481)
wrapper (swift/rlhf_trainers/utils.py:607)
_generate_and_score_completions (swift/megatron/trainers/grpo_trainer.py:379)
_replace_data_iterator (swift/megatron/trainers/grpo_trainer.py:244)
train_step (swift/megatron/trainers/base.py:811)
train (swift/megatron/trainers/base.py:596)
train (swift/megatron/trainers/grpo_trainer.py:62)
run (swift/megatron/pipelines/train/sft.py:68)
main (swift/pipelines/base.py:52)
megatron_rlhf_main (swift/megatron/pipelines/train/rlhf.py:73)
(swift/cli/_megatron/rlhf.py:7)
Thread 60196 (idle): "Thread-1"
wait (threading.py:331)
wait (threading.py:629)
run (tqdm/_monitor.py:60)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 60310 (idle): "wandb-AsyncioManager-main"
select (selectors.py:468)
_run_once (asyncio/base_events.py:1898)
run_forever (asyncio/base_events.py:608)
run_until_complete (asyncio/base_events.py:641)
run (asyncio/runners.py:118)
run (asyncio/runners.py:190)
run (wandb/sdk/lib/asyncio_compat.py:76)
_main (wandb/sdk/lib/asyncio_manager.py:234)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 60329 (idle): "Thread-8"
wait (threading.py:331)
wait (threading.py:629)
run (tqdm/_monitor.py:60)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 60558 (idle)
Thread 60559 (idle)
Thread 60560 (idle)
Thread 60561 (idle)
Thread 60562 (idle)
Thread 60563 (idle)
Thread 60564 (idle)
Thread 60565 (idle)
Thread 61015 (idle): "Thread-9 (_report_usage_worker)"
_report_continuous_usage (vllm/usage/usage_lib.py:251)
_report_usage_worker (vllm/usage/usage_lib.py:174)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61125 (idle): "Thread-10 (_pin_memory_loop)"
select (selectors.py:415)
wait (multiprocessing/connection.py:948)
_poll (multiprocessing/connection.py:440)
poll (multiprocessing/connection.py:257)
get (multiprocessing/queues.py:113)
do_one_step (torch/utils/data/_utils/pin_memory.py:28)
_pin_memory_loop (torch/utils/data/_utils/pin_memory.py:52)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61126 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61127 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61128 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61129 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61130 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61131 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61132 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61133 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61590 (idle): "WorkerAsyncOutput_0"
_worker (concurrent/futures/thread.py:81)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 63189 (idle): "Thread-11 (_append_worker)"
wait (threading.py:327)
get (queue.py:171)
_append_worker (swift/utils/io_utils.py:58)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
How to Reproduce / 如何复现
以下为训练脚本：

export IMAGE_MAX_TOKEN_NUM=1600
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
NPROC_PER_NODE=8
SWANLAB_API_KEY=&lt;REDACTED&gt;  # NOTE(review): a real API key was posted here — it has been redacted; please revoke/rotate the leaked key
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True'
megatron rlhf
--rlhf_type grpo
--model /opt/wfs2139/model/Qwen3.5-35B-A3B
--output_dir XX
--save_safetensors
--loss_type grpo
--expert_model_parallel_size 8
--pipeline_model_parallel_size 1
--dataset xx
--external_plugins xx
--reward_funcs remote
--save_steps 50
--num_train_epochs 1
--generation_batch_size 512
--global_batch_size 512
--micro_batch_size 4
--moe_permute_fusion true
--num_generations 8
--reward_funcs remote
--use_vllm true
--enable_thinking false
--vllm_mode colocate
--vllm_gpu_memory_utilization 0.25
--vllm_tensor_parallel_size 2
--vllm_max_model_len 8192
--max_completion_length 4096
--epsilon 3e-4
--epsilon_high 4e-4
--lr 3e-6
--min_lr 1e-8
--bf16 true
--beta 0.00
--importance_sampling_level sequence
--dynamic_sample false
--overlong_filter true
--sleep_level 2
--offload_model false
--offload_optimizer true
--optimizer_cpu_offload false
--use_precision_aware_optimizer
--logging_steps 1
--recompute_granularity full
--recompute_method uniform
--recompute_num_layers 1
--finetune
--dataloader_num_workers 8
--dataset_num_proc 8
--no_save_optim
--no_save_rng
--attention_backend flash
--temperature 1.0
--padding_free false
--sequence_parallel false
--log_completions true --report_to swanlab
Additional Information / 补充信息
No response
Checklist / 检查清单
Bug Description / Bug 描述
使用 `py-spy dump --pid <PID>` 查看对应进程的 call stack，如下所示（0 卡）：Thread 59633 (active): "MainThread"
call (torch/_ops.py:1209)
all_reduce (vllm/distributed/device_communicators/symm_mem.py:148)
all_reduce (vllm/distributed/device_communicators/cuda_communicator.py:201)
_all_reduce_out_place (vllm/distributed/parallel_state.py:514)
all_reduce (vllm/distributed/parallel_state.py:132)
call (torch/_ops.py:819)
call (cnm2xsczclisoazvzg2o3rmqmp6zyjyllexblt7qithbgckrvifh.py:1317)
run (torch/_inductor/utils.py:3220)
call (torch/_inductor/output_code.py:638)
wrapper (torch/_functorch/_aot_autograd/runtime_wrappers.py:531)
call_func_at_runtime_with_args (torch/_functorch/_aot_autograd/utils.py:134)
runtime_wrapper (torch/_functorch/_aot_autograd/runtime_wrappers.py:357)
call (torch/_functorch/_aot_autograd/runtime_wrappers.py:1962)
forward (torch/_functorch/aot_autograd.py:1148)
_fn (torch/_dynamo/eval_frame.py:1181)
call (torch/_inductor/standalone_compile.py:122)
call (vllm/compilation/piecewise_backend.py:343)
call (vllm/compilation/cuda_graph.py:223)
forward (<eval_with_key>.110:590)
_call_impl (torch/nn/modules/module.py:1787)
_wrapped_call_impl (torch/nn/modules/module.py:1776)
call (torch/fx/graph_module.py:442)
call_wrapped (torch/fx/graph_module.py:936)
call (vllm/compilation/caching.py:198)
forward (vllm/model_executor/models/qwen3_next.py:1132)
call (torch/_dynamo/aot_compile.py:124)
call (vllm/compilation/decorators.py:402)
forward (vllm/model_executor/models/qwen3_5.py:738)
_call_impl (torch/nn/modules/module.py:1787)
_wrapped_call_impl (torch/nn/modules/module.py:1776)
call (vllm/compilation/cuda_graph.py:223)
_model_forward (vllm/v1/worker/gpu_model_runner.py:3152)
execute_model (vllm/v1/worker/gpu_model_runner.py:3639)
decorate_context (torch/utils/_contextlib.py:124)
execute_model (vllm/v1/worker/gpu_worker.py:728)
decorate_context (torch/utils/_contextlib.py:124)
execute_model (vllm/v1/worker/worker_base.py:365)
run_method (vllm/v1/serial_utils.py:459)
collective_rpc (vllm/v1/executor/uniproc_executor.py:80)
execute_model (vllm/v1/executor/uniproc_executor.py:101)
step_with_batch_queue (vllm/v1/engine/core.py:449)
get_output (vllm/v1/engine/core_client.py:285)
step (vllm/v1/engine/llm_engine.py:302)
infer (swift/infer_engine/vllm_engine.py:754)
infer (swift/infer_engine/grpo_vllm_engine.py:44)
_colocate_rollout (swift/megatron/trainers/rollout_mixin.py:457)
_rollout (swift/megatron/trainers/grpo_trainer.py:499)
_generate_completions (swift/megatron/trainers/grpo_trainer.py:481)
wrapper (swift/rlhf_trainers/utils.py:607)
_generate_and_score_completions (swift/megatron/trainers/grpo_trainer.py:379)
_replace_data_iterator (swift/megatron/trainers/grpo_trainer.py:244)
train_step (swift/megatron/trainers/base.py:811)
train (swift/megatron/trainers/base.py:596)
train (swift/megatron/trainers/grpo_trainer.py:62)
run (swift/megatron/pipelines/train/sft.py:68)
main (swift/pipelines/base.py:52)
megatron_rlhf_main (swift/megatron/pipelines/train/rlhf.py:73)
(swift/cli/_megatron/rlhf.py:7)
Thread 60196 (idle): "Thread-1"
wait (threading.py:331)
wait (threading.py:629)
run (tqdm/_monitor.py:60)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 60310 (idle): "wandb-AsyncioManager-main"
select (selectors.py:468)
_run_once (asyncio/base_events.py:1898)
run_forever (asyncio/base_events.py:608)
run_until_complete (asyncio/base_events.py:641)
run (asyncio/runners.py:118)
run (asyncio/runners.py:190)
run (wandb/sdk/lib/asyncio_compat.py:76)
_main (wandb/sdk/lib/asyncio_manager.py:234)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 60329 (idle): "Thread-8"
wait (threading.py:331)
wait (threading.py:629)
run (tqdm/_monitor.py:60)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 60558 (idle)
Thread 60559 (idle)
Thread 60560 (idle)
Thread 60561 (idle)
Thread 60562 (idle)
Thread 60563 (idle)
Thread 60564 (idle)
Thread 60565 (idle)
Thread 61015 (idle): "Thread-9 (_report_usage_worker)"
_report_continuous_usage (vllm/usage/usage_lib.py:251)
_report_usage_worker (vllm/usage/usage_lib.py:174)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61125 (idle): "Thread-10 (_pin_memory_loop)"
select (selectors.py:415)
wait (multiprocessing/connection.py:948)
_poll (multiprocessing/connection.py:440)
poll (multiprocessing/connection.py:257)
get (multiprocessing/queues.py:113)
do_one_step (torch/utils/data/_utils/pin_memory.py:28)
_pin_memory_loop (torch/utils/data/_utils/pin_memory.py:52)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61126 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61127 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61128 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61129 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61130 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61131 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61132 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61133 (idle): "QueueFeederThread"
wait (threading.py:327)
_feed (multiprocessing/queues.py:231)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 61590 (idle): "WorkerAsyncOutput_0"
_worker (concurrent/futures/thread.py:81)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
Thread 63189 (idle): "Thread-11 (_append_worker)"
wait (threading.py:327)
get (queue.py:171)
_append_worker (swift/utils/io_utils.py:58)
run (threading.py:982)
_bootstrap_inner (threading.py:1045)
_bootstrap (threading.py:1002)
How to Reproduce / 如何复现
以下为训练脚本：

export IMAGE_MAX_TOKEN_NUM=1600
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
NPROC_PER_NODE=8
SWANLAB_API_KEY=&lt;REDACTED&gt;  # NOTE(review): a real API key was posted here — it has been redacted; please revoke/rotate the leaked key
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True'
megatron rlhf
--rlhf_type grpo
--model /opt/wfs2139/model/Qwen3.5-35B-A3B
--output_dir XX
--save_safetensors
--loss_type grpo
--expert_model_parallel_size 8
--pipeline_model_parallel_size 1
--dataset xx
--external_plugins xx
--reward_funcs remote
--save_steps 50
--num_train_epochs 1
--generation_batch_size 512
--global_batch_size 512
--micro_batch_size 4
--moe_permute_fusion true
--num_generations 8
--reward_funcs remote
--use_vllm true
--enable_thinking false
--vllm_mode colocate
--vllm_gpu_memory_utilization 0.25
--vllm_tensor_parallel_size 2
--vllm_max_model_len 8192
--max_completion_length 4096
--epsilon 3e-4
--epsilon_high 4e-4
--lr 3e-6
--min_lr 1e-8
--bf16 true
--beta 0.00
--importance_sampling_level sequence
--dynamic_sample false
--overlong_filter true
--sleep_level 2
--offload_model false
--offload_optimizer true
--optimizer_cpu_offload false
--use_precision_aware_optimizer
--logging_steps 1
--recompute_granularity full
--recompute_method uniform
--recompute_num_layers 1
--finetune
--dataloader_num_workers 8
--dataset_num_proc 8
--no_save_optim
--no_save_rng
--attention_backend flash
--temperature 1.0
--padding_free false
--sequence_parallel false
--log_completions true --report_to swanlab
Additional Information / 补充信息
No response