From c17c78b973467cd91688eba28b6af49e54062980 Mon Sep 17 00:00:00 2001
From: root
Date: Wed, 18 Mar 2026 15:11:08 +0800
Subject: [PATCH 01/10] update config h and add glm4.7 mtp test

---
 autotest/config_h.yml                       | 268 ++++++++++++------
 autotest/config_h_legacy.yml                |  26 +-
 autotest/evaluate/test_api_evaluate.py      |  10 +-
 autotest/tools/common_case_config.py        |  26 ++
 .../test_pipeline_chat_pytorch_llm.py       |  10 +
 .../test_restful_chat_hf_pytorch_llm.py     |  10 +
 autotest/utils/benchmark_utils.py           |  26 +-
 autotest/utils/config_utils.py              |  10 +
 autotest/utils/constant.py                  |  32 ++-
 autotest/utils/run_client_chat.py           |  21 +-
 10 files changed, 332 insertions(+), 107 deletions(-)

diff --git a/autotest/config_h.yml b/autotest/config_h.yml
index 667033f36c..b0ed3a8f87 100644
--- a/autotest/config_h.yml
+++ b/autotest/config_h.yml
@@ -12,33 +12,34 @@ device: cuda
 config:
   tp:
-    Qwen/Qwen3-235B-A22B-FP8: 4
-    internlm/Intern-S1: 4
     Qwen/Qwen3-235B-A22B-Thinking-2507-FP8: 4
     Qwen/Qwen3-30B-A3B: 2
-    Qwen/Qwen3-32B: 2
     openai/gpt-oss-120b: 2
-    openai/gpt-oss-120b-BF16: 4
-    openai/gpt-oss-20b-BF16: 2
-    deepseek/DeepSeek-V3.1: 8
+    openai/gpt-oss-20b: 2
+    unsloth/gpt-oss-20b-BF16: 2
+    deepseek-ai/DeepSeek-V3.1: 8
     Qwen/Qwen3-30B-A3B-Base: 2
-    JetLM/SDAR-30B-A3B-Sci: 2
-    moonshotai/Kimi-K2-Instruct-0905: 16
     Qwen/Qwen3-235B-A22B-Thinking-2507: 8
     OpenGVLab/InternVL3_5-38B: 2
     Qwen/Qwen3-VL-30B-A3B-Instruct: 2
-    internlm/Intern-S1-Pro-FP8: 16
+    zai-org/GLM-5: 16
+    Qwen/Qwen3.5-27B: 2
+    Qwen/Qwen3.5-35B-A3B: 2
+    Qwen/Qwen3.5-122B-A10B: 4
+    meta-llama/Llama-4-Scout-17B-16E-Instruct: 4
+    meta-llama/Meta-Llama-3.1-70B-Instruct: 4
+    OpenGVLab/InternVL3-38B: 2
+    Qwen/Qwen2.5-VL-32B-Instruct: 2
+    deepseek-ai/DeepSeek-V2-Lite-Chat: 2
+    mistralai/Mixtral-8x7B-Instruct-v0.1: 2
+    OpenGVLab/InternVL3_5-30B-A3B: 2
+    zai-org/GLM-4.7-Flash: 2
+    google/gemma-3-27b-it: 2

   dp_ep:
-    moonshotai/Kimi-K2-Instruct-0905:
-      dp: 16
-      ep: 16
     Qwen/Qwen3-235B-A22B-Thinking-2507:
       dp: 8
       ep: 8
-    internlm/Intern-S1-Pro-FP8:
-      dp: 16
-      ep: 16

   cp_tp:
     Qwen/Qwen3-235B-A22B-Thinking-2507:
@@ -48,64 +49,109 @@ config:

 turbomind_chat_model:
   tp:
-    - Qwen/Qwen3-0.6B-FP8
-    - Qwen/Qwen3-1.7B-FP8
-    - Qwen/Qwen3-4B-FP8
-    - Qwen/Qwen3-8B-FP8
-    - Qwen/Qwen3-14B-FP8
-    - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
-    - Qwen/Qwen3-235B-A22B-Thinking-2507
+    - meta-llama/Llama-3.2-1B-Instruct
+    - meta-llama/Llama-3.2-3B-Instruct
+    - meta-llama/Meta-Llama-3.1-8B-Instruct
+    - meta-llama/Meta-Llama-3.1-70B-Instruct
+    - meta-llama/Meta-Llama-3-8B-Instruct
+    - internlm/internlm3-8b-instruct
+    - internlm/internlm3-8b-instruct-awq
+    - OpenGVLab/InternVL3-8B
+    - OpenGVLab/InternVL3-38B
+    - OpenGVLab/InternVL3_5-30B-A3B
+    - Qwen/Qwen3-0.6B
+    - Qwen/Qwen3-4B
+    - Qwen/Qwen3-8B
+    - Qwen/Qwen3-32B-FP8
     - Qwen/Qwen3-30B-A3B
     - Qwen/Qwen3-30B-A3B-FP8
-    - Qwen/Qwen3-32B
-    - Qwen/Qwen3-32B-FP8
+    - Qwen/Qwen3-235B-A22B-Thinking-2507
+    - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
+    - Qwen/Qwen2.5-VL-7B-Instruct
+    - Qwen/Qwen2.5-VL-32B-Instruct
+    - Qwen/Qwen1.5-MoE-A2.7B-Chat
+    - mistralai/Mixtral-8x7B-Instruct-v0.1
     - OpenGVLab/InternVL3_5-38B
+    - deepseek-ai/DeepSeek-V2-Lite-Chat
+    - THUDM/glm-4-9b-chat
     - openai/gpt-oss-120b
     - openai/gpt-oss-20b

   cp_tp:
     - Qwen/Qwen3-235B-A22B-Thinking-2507
+
 pytorch_chat_model:
   tp:
-    - Qwen/Qwen3-0.6B-FP8
-    - Qwen/Qwen3-1.7B-FP8
-    - Qwen/Qwen3-4B-FP8
-    - Qwen/Qwen3-8B-FP8
-    - Qwen/Qwen3-14B-FP8
-    - Qwen/Qwen3-235B-A22B-Thinking-2507
-    - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
+    - meta-llama/Llama-4-Scout-17B-16E-Instruct
+    - meta-llama/Llama-3.2-1B-Instruct
+    - meta-llama/Llama-3.2-3B-Instruct
+    - meta-llama/Meta-Llama-3.1-8B-Instruct
+    - meta-llama/Meta-Llama-3.1-70B-Instruct
+    - meta-llama/Meta-Llama-3-8B-Instruct
+    - internlm/internlm3-8b-instruct
+    - OpenGVLab/InternVL3-8B
+    - OpenGVLab/InternVL3-38B
+    - OpenGVLab/InternVL3_5-30B-A3B
+    - Qwen/Qwen3-0.6B
+    - Qwen/Qwen3-4B
+    - Qwen/Qwen3-8B
+    - Qwen/Qwen3-32B-FP8
     - Qwen/Qwen3-30B-A3B
     - Qwen/Qwen3-30B-A3B-FP8
-    - Qwen/Qwen3-32B
-    - Qwen/Qwen3-32B-FP8
+    - Qwen/Qwen3-235B-A22B-Thinking-2507
+    - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
+    - Qwen/Qwen3-VL-8B-Instruct
     - Qwen/Qwen3-VL-30B-A3B-Instruct
-    - OpenGVLab/InternVL3_5-38B
-    - unsloth/gpt-oss-120b-BF16
+    - THUDM/cogvlm-chat-hf
+    - THUDM/cogvlm2-llama3-chinese-chat-19B
+    - THUDM/glm-4v-9b
+    - THUDM/glm-4-9b-chat
+    - zai-org/GLM-4.7-Flash
+    - microsoft/Phi-3.5-vision-instruct
+    - microsoft/Phi-3-vision-128k-instruct
+    - zai-org/GLM-5
+    - Qwen/Qwen3.5-27B
+    - Qwen/Qwen3.5-35B-A3B
+    - Qwen/Qwen3.5-122B-A10B
+    - deepseek-ai/DeepSeek-V3.1
     - unsloth/gpt-oss-20b-BF16
-    - deepseek/DeepSeek-V3.1
-    - moonshotai/Kimi-K2-Instruct-0905
-    - internlm/Intern-S1-Pro-FP8
-    - JetLM/SDAR-30B-A3B-Sci
+    - google/gemma-3-27b-it
+    - OpenGVLab/InternVL3_5-38B

   dp_ep:
-    - moonshotai/Kimi-K2-Instruct-0905
     - Qwen/Qwen3-235B-A22B-Thinking-2507
-    - internlm/Intern-S1-Pro-FP8

 turbomind_vl_model:
   tp:
+    - OpenGVLab/InternVL3-8B
+    - OpenGVLab/InternVL3-38B
+    - OpenGVLab/InternVL3_5-30B-A3B
+    - Qwen/Qwen2.5-VL-7B-Instruct
+    - Qwen/Qwen2.5-VL-32B-Instruct
     - OpenGVLab/InternVL3_5-38B

 pytorch_vl_model:
   tp:
-    - OpenGVLab/InternVL3_5-38B
+    - OpenGVLab/InternVL3-8B
+    - OpenGVLab/InternVL3_5-30B-A3B
+    - Qwen/Qwen3-VL-8B-Instruct
     - Qwen/Qwen3-VL-30B-A3B-Instruct
+    - THUDM/cogvlm-chat-hf
+    - THUDM/cogvlm2-llama3-chinese-chat-19B
+    - THUDM/glm-4v-9b
+    - microsoft/Phi-3-vision-128k-instruct
+    - microsoft/Phi-3.5-vision-instruct
+    - Qwen/Qwen3.5-27B
+    - Qwen/Qwen3.5-35B-A3B
+    - Qwen/Qwen3.5-122B-A10B
+    - OpenGVLab/InternVL3_5-38B
+

 turbomind_base_model:
   tp:
-    - Qwen/Qwen3-4B-FP8
-    - openai/gpt-oss-20b
+    - Qwen/Qwen3-8B-Base
+    - Qwen/Qwen3-30B-A3B-Base

 pytorch_base_model:
   tp:
@@ -114,94 +160,134 @@ pytorch_base_model:

 turbomind_quantization:
   no_awq:
-    - Qwen/Qwen3-0.6B-FP8
-    - Qwen/Qwen3-1.7B-FP8
-    - Qwen/Qwen3-4B-FP8
-    - Qwen/Qwen3-8B-FP8
-    - Qwen/Qwen3-14B-FP8
-    - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
+    - meta-llama/Meta-Llama-3.1-70B-Instruct
+    - internlm/internlm3-8b-instruct
     - Qwen/Qwen3-30B-A3B
-    - Qwen/Qwen3-30B-A3B-FP8
-    - Qwen/Qwen3-32B
+    - Qwen/Qwen3-235B-A22B-Thinking-2507
+    - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
     - Qwen/Qwen3-32B-FP8
+    - Qwen/Qwen3-30B-A3B-FP8
+    - Qwen/Qwen3-30B-A3B-Base
+    - Qwen/Qwen1.5-MoE-A2.7B-Chat
+    - Qwen/Qwen2.5-VL-7B-Instruct
+    - Qwen/Qwen2.5-VL-32B-Instruct
+    - OpenGVLab/InternVL3_5-30B-A3B
+    - deepseek-ai/DeepSeek-V2-Lite-Chat
     - openai/gpt-oss-120b
     - openai/gpt-oss-20b
-    - Qwen/Qwen3-235B-A22B-Thinking-2507
+    - microsoft/Phi-3-mini-4k-instruct
+    - THUDM/glm-4v-9b
+    - THUDM/glm-4-9b-chat
+
   gptq:
     - empty
   no_kvint4:
-    - Qwen/Qwen3-0.6B-FP8
-    - Qwen/Qwen3-1.7B-FP8
-    - Qwen/Qwen3-4B-FP8
-    - Qwen/Qwen3-8B-FP8
-    - Qwen/Qwen3-14B-FP8
-    - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
+    - meta-llama/Llama-3.2-1B-Instruct
+    - OpenGVLab/InternVL3-2B
+    - OpenGVLab/InternVL3-8B
+    - Qwen/Qwen3-0.6B
+    - Qwen/Qwen3-4B
+    - Qwen/Qwen3-8B
+    - Qwen/Qwen3-32B-FP8
     - Qwen/Qwen3-30B-A3B
     - Qwen/Qwen3-30B-A3B-FP8
-    - Qwen/Qwen3-32B
-    - Qwen/Qwen3-32B-FP8
+    - Qwen/Qwen3-235B-A22B-Thinking-2507
+    - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
+    - Qwen/Qwen2.5-VL-7B-Instruct
+    - Qwen/Qwen2.5-VL-32B-Instruct
+    - Qwen/Qwen1.5-MoE-A2.7B-Chat
+    - Qwen/Qwen3-8B-Base
+    - Qwen/Qwen3-30B-A3B-Base
+    - deepseek-ai/DeepSeek-V2-Lite-Chat
     - openai/gpt-oss-120b
     - openai/gpt-oss-20b
-    - Qwen/Qwen3-235B-A22B-Thinking-2507
   no_kvint8:
-    - Qwen/Qwen3-235B-A22B-Thinking-2507
+    - deepseek-ai/DeepSeek-V2-Chat
+    - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8

 pytorch_quantization:
   awq:
-    - empty
+    - meta-llama/Llama-3.2-3B-Instruct
+    - meta-llama/Meta-Llama-3-8B-Instruct
+    - meta-llama/Meta-Llama-3.1-8B-Instruct
+    - internlm/internlm3-8b-instruct
+    - Qwen/Qwen3-0.6B
+    - Qwen/Qwen3-4B
+    - Qwen/Qwen3-8B
   w8a8:
-    - empty
+    - meta-llama/Llama-3.2-1B-Instruct
+    - meta-llama/Meta-Llama-3-8B-Instruct
+    - meta-llama/Meta-Llama-3.1-8B-Instruct
+    - internlm/internlm3-8b-instruct
+    - microsoft/Phi-3-mini-4k-instruct
   no_kvint4:
+    - meta-llama/Llama-3.2-1B-Instruct
+    - OpenGVLab/InternVL3-2B
+    - OpenGVLab/InternVL3-8B
     - Qwen/Qwen3-8B-Base
-    - Qwen/Qwen3-0.6B-FP8
-    - Qwen/Qwen3-1.7B-FP8
-    - Qwen/Qwen3-4B-FP8
-    - Qwen/Qwen3-8B-FP8
-    - Qwen/Qwen3-14B-FP8
-    - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
+    - Qwen/Qwen3-30B-A3B-Base
+    - Qwen/Qwen3-0.6B
+    - Qwen/Qwen3-4B
+    - Qwen/Qwen3-8B
+    - Qwen/Qwen3-32B-FP8
     - Qwen/Qwen3-30B-A3B
     - Qwen/Qwen3-30B-A3B-FP8
-    - Qwen/Qwen3-32B
-    - Qwen/Qwen3-32B-FP8
-    - moonshotai/Kimi-K2-Instruct-0905
     - Qwen/Qwen3-235B-A22B-Thinking-2507
-    - internlm/Intern-S1-Pro-FP8
-    - JetLM/SDAR-30B-A3B-Sci
-    - deepseek/DeepSeek-V3.1
+    - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
+    - Qwen/Qwen3-VL-8B-Instruct
+    - Qwen/Qwen3-VL-30B-A3B-Instruct
+    - microsoft/Phi-3-vision-128k-instruct
+    - microsoft/Phi-3.5-vision-instruct
+    - zai-org/GLM-4.7-Flash
+    - zai-org/GLM-5
+    - Qwen/Qwen3.5-27B
+    - Qwen/Qwen3.5-35B-A3B
+    - Qwen/Qwen3.5-122B-A10B
+    - deepseek-ai/DeepSeek-V3.1
   no_kvint8:
-    - Qwen/Qwen3-235B-A22B-Thinking-2507
-    - internlm/Intern-S1-Pro-FP8
-    - deepseek/DeepSeek-V3.1
+    - zai-org/GLM-4.7-Flash
+    - zai-org/GLM-5
+    - deepseek-ai/DeepSeek-V3.1
+    - Qwen/Qwen3.5-27B
+    - Qwen/Qwen3.5-35B-A3B
+    - Qwen/Qwen3.5-122B-A10B
+    - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8

 longtext_model:
   - Qwen/Qwen3-30B-A3B
   - Qwen/Qwen3-235B-A22B-Thinking-2507
+  - zai-org/GLM-5
+  - Qwen/Qwen3.5-27B
+  - Qwen/Qwen3.5-35B-A3B
+  - Qwen/Qwen3.5-122B-A10B

 benchmark_model:
-  - meta-llama/Meta-Llama-3-1-8B-Instruct
-  - meta-llama/Meta-Llama-3-1-70B-Instruct
-  - Qwen/Qwen3-32B
+  - meta-llama/Meta-Llama-3.1-8B-Instruct
+  - meta-llama/Meta-Llama-3.1-70B-Instruct
   - Qwen/Qwen3-30B-A3B
   - Qwen/Qwen3-235B-A22B-Thinking-2507
   - Qwen/Qwen2.5-72B-Instruct
   - openai/gpt-oss-120b
   - openai/gpt-oss-20b
   - unsloth/gpt-oss-20b-BF16
-  - unsloth/gpt-oss-120b-BF16
+  - zai-org/GLM-5
+  - Qwen/Qwen3.5-27B
+  - Qwen/Qwen3.5-35B-A3B
+  - Qwen/Qwen3.5-122B-A10B
+  - google/gemma-3-27b-it

 evaluate_model:
-  - Qwen/Qwen3-32B
   - Qwen/Qwen3-32B-FP8
   - Qwen/Qwen3-30B-A3B
   - Qwen/Qwen3-30B-A3B-FP8
   - Qwen/Qwen3-235B-A22B-Thinking-2507
   - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
   - openai/gpt-oss-120b
-  - unsloth/gpt-oss-120b-BF16
-  - deepseek/DeepSeek-V3.1
-  - moonshotai/Kimi-K2-Instruct-0905
-  - internlm/Intern-S1-Pro-FP8
-  - JetLM/SDAR-30B-A3B-Sci
+  - deepseek-ai/DeepSeek-V3.1
+  - zai-org/GLM-5
+  - Qwen/Qwen3.5-27B
+  - Qwen/Qwen3.5-35B-A3B
+  - Qwen/Qwen3.5-122B-A10B

 mllm_evaluate_model:
   - OpenGVLab/InternVL3_5-38B
diff --git a/autotest/config_h_legacy.yml b/autotest/config_h_legacy.yml
index 02c9f9fcc6..a9d922aad6 100644
--- a/autotest/config_h_legacy.yml
+++ b/autotest/config_h_legacy.yml
@@ -5,14 +5,25 @@ server_log_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/cicd-autote
 eval_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/cicd-autotest/eval_resource/evaluation_report
 mllm_eval_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/cicd-autotest/eval_resource/mllm_evaluation_report
 benchmark_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/cicd-autotest/eval_resource/benchmark_report
-dataset_path: /mnt/shared-storage-user/auto-eval-pipeline/datasets/ShareGPT_V3_unfiltered_cleaned_split.json
-prefix_dataset_path: /mnt/shared-storage-user/auto-eval-pipeline/datasets/prefix_cache_test.json
+dataset_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/cicd-autotest/eval_resource/datasets/ShareGPT_V3_unfiltered_cleaned_split.json
+prefix_dataset_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/cicd-autotest/eval_resource/datasets/prefix_cache_test.json
 env_tag: h
 device: cuda

 config:
   tp:
     internlm/Intern-S1: 4
+    internlm/Intern-S1-Pro-FP8: 16
+    JetLM/SDAR-30B-A3B-Sci: 2
+    moonshotai/Kimi-K2-Instruct-0905: 16
+
+  dp_ep:
+    internlm/Intern-S1-Pro-FP8:
+      dp: 16
+      ep: 16
+    moonshotai/Kimi-K2-Instruct-0905:
+      dp: 16
+      ep: 16

 turbomind_chat_model:
   tp:
@@ -23,6 +34,11 @@ pytorch_chat_model:
   tp:
     - internlm/Intern-S1
     - internlm/Intern-S1-mini
+    - JetLM/SDAR-30B-A3B-Sci
+    - moonshotai/Kimi-K2-Instruct-0905
+
+  dp_ep:
+    - moonshotai/Kimi-K2-Instruct-0905

 turbomind_vl_model:
   tp:
@@ -60,6 +76,7 @@ pytorch_quantization:
   no_kvint4:
     - internlm/Intern-S1
    - internlm/Intern-S1-mini
+    - JetLM/SDAR-30B-A3B-Sci
   no_kvint8:
     - empty

@@ -67,6 +84,11 @@ benchmark_model:
   - internlm/Intern-S1
   - internlm/Intern-S1-mini

+evaluate_model:
+  - internlm/Intern-S1-Pro-FP8
+  - JetLM/SDAR-30B-A3B-Sci
+  - moonshotai/Kimi-K2-Instruct-0905
+
 mllm_evaluate_model:
   - internlm/Intern-S1
   - internlm/Intern-S1-mini
diff --git a/autotest/evaluate/test_api_evaluate.py b/autotest/evaluate/test_api_evaluate.py
index dda08d722c..785b79f668 100644
--- a/autotest/evaluate/test_api_evaluate.py
+++ b/autotest/evaluate/test_api_evaluate.py
@@ -23,6 +23,8 @@ def _run_ray_distributed_test(
         eval_config_name = 'gpt'
     elif 'intern-s1-pro' in run_config.get('model', '').lower():
         eval_config_name = 'intern-s1-pro'
+    elif 'qwen3.5' in run_config.get('model', '').lower():
+        eval_config_name = 'qwen3.5'

     if str(config.get('env_tag')) == 'ascend':
         eval_config_name = f'{eval_config_name}-2batch'
@@ -68,6 +70,8 @@ def _run_proxy_distributed_test(config,
         eval_config_name = 'gpt'
     elif 'intern-s1-pro' in run_config.get('model', '').lower():
         eval_config_name = 'intern-s1-pro'
+    elif 'qwen3.5' in run_config.get('model', '').lower():
+        eval_config_name = 'qwen3.5'

     if str(config.get('env_tag')) == 'ascend':
         eval_config_name = f'{eval_config_name}-2batch'
@@ -116,6 +120,8 @@ def run_eval_test(config, run_config, worker_id, test_type='infer', eval_config_
         eval_config_name = 'sdar'
     elif 'intern-s1-pro' in run_config.get('model', '').lower():
         eval_config_name = 'intern-s1-pro'
+    elif 'qwen3.5' in run_config.get('model', '').lower():
+        eval_config_name = 'qwen3.5'
     if str(config.get('env_tag')) == 'a100':
         eval_config_name = f'{eval_config_name}-32k'
     elif str(config.get('env_tag')) == 'ascend':
@@ -231,7 +237,7 @@ def test_turbomind_infer_tp8(config, run_config, worker_id):

 @pytest.mark.infer
 @pytest.mark.turbomind
-@pytest.mark.gpu_num_cp2tp8
+@pytest.mark.gpu_num_distributed_cp2tp8
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.parametrize('run_config', get_models('turbomind', {'cp': 2, 'tp': 8}))
 def test_turbomind_infer_cp2tp8(config, run_config, worker_id):
@@ -442,7 +448,7 @@ def test_pytorch_eval_distributed_dpep16(config, run_config, worker_id):

 @pytest.mark.eval
 @pytest.mark.turbomind
-@pytest.mark.gpu_num_cp2tp8
+@pytest.mark.gpu_num_distributed_cp2tp8
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.parametrize('run_config', get_models('turbomind', {'cp': 2, 'tp': 8}))
 def test_turbomind_eval_cp2tp8(config, run_config, worker_id):
diff --git a/autotest/tools/common_case_config.py b/autotest/tools/common_case_config.py
index 12334e8815..3eac291a8f 100644
--- a/autotest/tools/common_case_config.py
+++ b/autotest/tools/common_case_config.py
@@ -349,6 +349,20 @@
             'model': 'yuhuili/EAGLE3-LLaMA3.1-Instruct-8B'
         }
     }
+}, {
+    'model': 'zai-org/GLM-4.7-Flash',
+    'communicator': 'nccl',
+    'quant_policy': 0,
+    'parallel_config': {
+        'tp': 2
+    },
+    'extra_params': {
+        'max_batch_size': 128,
+        'speculative_config': {
+            'method': 'deepseek_mtp',
+            'num_speculative_tokens': 3
+        }
+    }
 }]

 SPECULATIVE_DECODING_PIPELINE_TEST_LLM = [{
@@ -380,6 +394,18 @@
         'speculative-num-draft-tokens': 3,
         'max-batch-size': 128
     }
+}, {
+    'model': 'zai-org/GLM-4.7-Flash',
+    'communicator': 'nccl',
+    'quant_policy': 0,
+    'parallel_config': {
+        'tp': 2
+    },
+    'extra_params': {
+        'speculative-algorithm': 'deepseek_mtp',
+        'speculative-num-draft-tokens': 3,
+        'max-batch-size': 128
+    }
 }]

 SPECULATIVE_DECODING_RESTFUL_TEST_LLM = [{
diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py
index bc41a8156c..6ee27bdd8c 100644
--- a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py
+++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py
@@ -105,3 +105,13 @@ def test_pytorch_chat_with_lora_tp2(config, run_config, common_case_config, work
 def test_pipeline_chat_speculative_decoding_tp1(config, run_config, common_case_config, worker_id):
     case_config = {k: v for k, v in common_case_config.items() if k == 'memory_test'}
     run_pipeline_llm_test(config, run_config, case_config, worker_id)
+
+
+@pytest.mark.usefixtures('common_case_config')
+@pytest.mark.flaky(reruns=0)
+@pytest.mark.gpu_num_2
+@pytest.mark.parametrize(
+    'run_config', [item for item in SPECULATIVE_DECODING_PIPELINE_TEST_LLM if item['parallel_config'].get('tp') == 2])
+def test_pipeline_chat_speculative_decoding_tp2(config, run_config, common_case_config, worker_id):
+    case_config = {k: v for k, v in common_case_config.items() if k == 'memory_test'}
+    run_pipeline_llm_test(config, run_config, case_config, worker_id)
diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py
index c27822eb47..70be8056e9 100644
--- a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py
+++ b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py
@@ -239,6 +239,16 @@ def test_restful_chat_speculative_decoding_tp1(config, run_config, common_case_c
     run_llm_test(config, run_config, case_config, worker_id)


+@pytest.mark.usefixtures('common_case_config')
+@pytest.mark.flaky(reruns=0)
+@pytest.mark.gpu_num_2
+@pytest.mark.parametrize(
+    'run_config', [item for item in SPECULATIVE_DECODING_RESTFUL_TEST_LLM if item['parallel_config'].get('tp') == 2])
+def test_restful_chat_speculative_decoding_tp2(config, run_config, common_case_config, worker_id):
+    case_config = {k: v for k, v in common_case_config.items() if k == 'memory_test'}
+    run_llm_test(config, run_config, case_config, worker_id)
+
+
 @pytest.mark.usefixtures('common_case_config')
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.gpu_num_distributed_tp16
diff --git a/autotest/utils/benchmark_utils.py b/autotest/utils/benchmark_utils.py
index 12a8979025..0185492c27 100644
--- a/autotest/utils/benchmark_utils.py
+++ b/autotest/utils/benchmark_utils.py
@@ -1,3 +1,4 @@
+import copy
 import os
 import time

@@ -7,6 +8,8 @@
 from utils.config_utils import get_case_str_by_config, get_cli_common_param, get_cuda_prefix_by_workerid, get_workerid
 from utils.run_restful_chat import health_check, start_openai_service, terminate_restful_api

+SERVE_ONLY_PARAMS = {'max-batch-size', 'max-prefill-token-num', 'server-name', 'enable-prefix-caching'}
+

 def throughput_test(config, run_config, worker_id: str = '', is_smoke: bool = False):
     model = run_config.get('model')
@@ -26,7 +29,12 @@ def throughput_test(config, run_config, worker_id: str = '', is_smoke: bool = Fa

     cuda_prefix = get_cuda_prefix_by_workerid(worker_id, run_config.get('parallel_config'))

-    command = f'{cuda_prefix} python3 benchmark/profile_throughput.py {dataset_path} {model_path} {get_cli_common_param(run_config)}'  # noqa
+    bench_config = copy.deepcopy(run_config)
+    bench_config['extra_params'] = {
+        k: v
+        for k, v in bench_config.get('extra_params', {}).items() if k not in SERVE_ONLY_PARAMS
+    }
+    command = f'{cuda_prefix} python3 benchmark/profile_throughput.py {dataset_path} {model_path} {get_cli_common_param(bench_config)}'  # noqa

     if is_smoke:
         num_prompts = '--num-prompts 100'
@@ -72,7 +80,12 @@ def longtext_throughput_test(config, run_config, worker_id: str = ''):

     cuda_prefix = get_cuda_prefix_by_workerid(worker_id, run_config.get('parallel_config'))

-    command = f'{cuda_prefix} python3 benchmark/profile_pipeline_api.py {dataset_path} {model_path} {get_cli_common_param(run_config)}'  # noqa
+    bench_config = copy.deepcopy(run_config)
+    bench_config['extra_params'] = {
+        k: v
+        for k, v in bench_config.get('extra_params', {}).items() if k not in SERVE_ONLY_PARAMS
+    }
+    command = f'{cuda_prefix} python3 benchmark/profile_pipeline_api.py {dataset_path} {model_path} {get_cli_common_param(bench_config)}'  # noqa

     env = os.environ.copy()
     env.update(run_config.get('env', {}))
@@ -210,10 +223,11 @@ def prefixcache_throughput_test(config, run_config, worker_id: str = '', is_smok

     cuda_prefix = get_cuda_prefix_by_workerid(worker_id, run_config.get('parallel_config'))

-    run_config_new = run_config.copy()
-    if 'extra_params' not in run_config_new:
-        run_config_new['extra_params'] = {}
-    run_config_new['extra_params'].pop('enable-prefix-caching', None)
+    run_config_new = copy.deepcopy(run_config)
+    run_config_new['extra_params'] = {
+        k: v
+        for k, v in run_config_new.get('extra_params', {}).items() if k not in SERVE_ONLY_PARAMS
+    }
     run_config_new['extra_params']['session-len'] = 32768

     command = f'{cuda_prefix} python3 benchmark/profile_pipeline_api.py {dataset_path} {model_path} {get_cli_common_param(run_config_new)}'  # noqa
diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py
index 3d71fe1e0d..16f79dc070 100644
--- a/autotest/utils/config_utils.py
+++ b/autotest/utils/config_utils.py
@@ -102,6 +102,13 @@ def get_func_config_list(backend: str,
             run_config['extra_params']['cache-max-entry-count'] = 0.9
             run_config['extra_params']['max-batch-size'] = 1024

+        if 'GLM-5' in run_config['model']:
+            run_config['extra_params']['cache-max-entry-count'] = 0.9
+            run_config['extra_params']['max-batch-size'] = 128
+
+        if 'Qwen3.5' in run_config['model']:
+            run_config['extra_params']['session-len'] = 128000
+
         if config.get('env_tag', '') in ['3090', '5080']:
             run_config['extra_params']['cache-max-entry-count'] = 0.5
@@ -128,6 +135,9 @@ def get_func_config_list(backend: str,
             run_config['extra_params']['max-prefill-token-num'] = 1024
             run_config['extra_params']['max-batch-size'] = 128

+        if 'openai/gpt-oss' in run_config['model']:
+            run_config['extra_params']['model-format'] = 'mxfp4'
+
     return run_configs

diff --git a/autotest/utils/constant.py b/autotest/utils/constant.py
index 153d3220e7..28d5933963 100644
--- a/autotest/utils/constant.py
+++ b/autotest/utils/constant.py
@@ -136,7 +136,37 @@
             'top_k': 50,
             'min_p': 0.0,
         }
-    }
+    },
+    'qwen3.5': {
+        'query_per_second': 4,
+        'max_out_len': 128000,
+        'max_seq_len': 128000,
+        'batch_size': 500,
+        'temperature': 1.0,
+        'openai_extra_kwargs': {
+            'top_p': 0.95,
+            'presence_penalty': 1.5,
+        },
+        'extra_body': {
+            'top_k': 20,
+            'min_p': 0.0,
+        }
+    },
+    'qwen3.5-2batch': {
+        'query_per_second': 4,
+        'max_out_len': 128000,
+        'max_seq_len': 128000,
+        'batch_size': 2,
+        'temperature': 1.0,
+        'openai_extra_kwargs': {
+            'top_p': 0.95,
+            'presence_penalty': 1.5,
+        },
+        'extra_body': {
+            'top_k': 20,
+            'min_p': 0.0,
+        }
+    },
 }

 MLLM_EVAL_CONFIGS = {
diff --git a/autotest/utils/run_client_chat.py b/autotest/utils/run_client_chat.py
index d02a758073..b5a57cdea1 100644
--- a/autotest/utils/run_client_chat.py
+++ b/autotest/utils/run_client_chat.py
@@ -1,3 +1,4 @@
+import copy
 import os
 import time
 from subprocess import PIPE, Popen
@@ -8,6 +9,11 @@

 TEMPLATE = 'autotest/template.json'

+CHAT_EXCLUDED_PARAMS = {
+    'max-batch-size', 'cache-max-entry-count', 'max-prefill-token-num', 'server-name', 'enable-prefix-caching',
+    'dllm-block-length', 'dllm-denoising-steps', 'dllm-confidence-threshold'
+}
+

 def run_tests(config, usercase, cli_case_config, run_config, worker_id):
     if 'coder' in run_config['model'].lower() and usercase == 'chat_testcase':
@@ -28,14 +34,19 @@ def hf_command_line_test(config, case, case_info, run_config, cuda_prefix: str =
     else:
         model_path = os.path.join(config.get('model_path'), model)

-    run_config['extra_params']['session_len'] = 4096
+    chat_config = copy.deepcopy(run_config)
+    chat_config['extra_params'] = {
+        k: v
+        for k, v in chat_config.get('extra_params', {}).items() if k not in CHAT_EXCLUDED_PARAMS
+    }
+    chat_config['extra_params']['session_len'] = 4096
     if case == 'base_testcase':
-        run_config['extra_params']['chat_template'] = TEMPLATE
-        run_config['extra_params']['session_len'] = 512
+        chat_config['extra_params']['chat_template'] = TEMPLATE
+        chat_config['extra_params']['session_len'] = 512

-    print(run_config)
+    print(chat_config)

-    cmd = ' '.join([cuda_prefix, ' '.join(['lmdeploy chat', model_path, get_cli_common_param(run_config)])]).strip()
+    cmd = ' '.join([cuda_prefix, ' '.join(['lmdeploy chat', model_path, get_cli_common_param(chat_config)])]).strip()

     result, chat_log, msg = command_test(config, cmd, run_config, case_info, True)
     if chat_log:

From c9d557fe31a7137e42aa2ef3d9ea7ef6a6e37000 Mon Sep 17 00:00:00 2001
From: littlegy <787321726@qq.com>
Date: Wed, 18 Mar 2026 16:10:43 +0800
Subject: [PATCH 02/10] update config

---
 autotest/config_ascend.yml | 2 +-
 autotest/config_h.yml      | 2 +-
 autotest/config_test.yml   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/autotest/config_ascend.yml b/autotest/config_ascend.yml
index 55a5e7728a..6087805f03 100644
--- a/autotest/config_ascend.yml
+++ b/autotest/config_ascend.yml
@@ -82,7 +82,7 @@ pytorch_quantization:
     - Qwen/Qwen3-VL-8B-Instruct
     - Qwen/Qwen3-VL-32B-Instruct

-longtext_model:
+longtext_benchmark_model:
   - Qwen/Qwen3-30B-A3B

 benchmark_model:
diff --git a/autotest/config_h.yml b/autotest/config_h.yml
index b0ed3a8f87..df2f88f304 100644
--- a/autotest/config_h.yml
+++ b/autotest/config_h.yml
@@ -253,7 +253,7 @@ pytorch_quantization:
     - Qwen/Qwen3.5-122B-A10B
     - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8

-longtext_model:
+longtext_benchmark_model:
   - Qwen/Qwen3-30B-A3B
   - Qwen/Qwen3-235B-A22B-Thinking-2507
   - zai-org/GLM-5
diff --git a/autotest/config_test.yml b/autotest/config_test.yml
index 2ac9d56bef..70c3ab4d37 100644
--- a/autotest/config_test.yml
+++ b/autotest/config_test.yml
@@ -167,7 +167,7 @@ pytorch_quantization:
     - test/test_vl_tp1
     - test/test_vl_dpep8

-longtext_model:
+longtext_benchmark_model:
   - test/test_tp1
   - test/test_tp1_pytorch
   - test/test_vl_tp2

From 92fba62468f0a77edef75fe89d9998aa5d8edd3b Mon Sep 17 00:00:00 2001
From: littlegy <787321726@qq.com>
Date: Wed, 18 Mar 2026 17:12:16 +0800
Subject: [PATCH 03/10] fix benchmark test

---
 autotest/utils/benchmark_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/autotest/utils/benchmark_utils.py b/autotest/utils/benchmark_utils.py
index 0185492c27..c8151b1359 100644
--- a/autotest/utils/benchmark_utils.py
+++ b/autotest/utils/benchmark_utils.py
@@ -8,7 +8,7 @@
 from utils.config_utils import get_case_str_by_config, get_cli_common_param, get_cuda_prefix_by_workerid, get_workerid
 from utils.run_restful_chat import health_check, start_openai_service, terminate_restful_api

-SERVE_ONLY_PARAMS = {'max-batch-size', 'max-prefill-token-num', 'server-name', 'enable-prefix-caching'}
+SERVE_ONLY_PARAMS = {'max-batch-size', 'max-prefill-token-num', 'server-name', 'enable-prefix-caching', 'session-len'}  # yapf: disable


 def throughput_test(config, run_config, worker_id: str = '', is_smoke: bool = False):

From 5d7f415760090cbfb227e4fd21f53233b0309852 Mon Sep 17 00:00:00 2001
From: littlegy <787321726@qq.com>
Date: Thu, 19 Mar 2026 15:57:38 +0800
Subject: [PATCH 04/10] fix gpt-oss args

---
 autotest/utils/benchmark_utils.py | 7 ++++++-
 autotest/utils/config_utils.py    | 3 ---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/autotest/utils/benchmark_utils.py b/autotest/utils/benchmark_utils.py
index c8151b1359..0c8d3a5789 100644
--- a/autotest/utils/benchmark_utils.py
+++ b/autotest/utils/benchmark_utils.py
@@ -8,7 +8,10 @@
 from utils.config_utils import get_case_str_by_config, get_cli_common_param, get_cuda_prefix_by_workerid, get_workerid
 from utils.run_restful_chat import health_check, start_openai_service, terminate_restful_api

-SERVE_ONLY_PARAMS = {'max-batch-size', 'max-prefill-token-num', 'server-name', 'enable-prefix-caching', 'session-len'}  # yapf: disable
+SERVE_ONLY_PARAMS = {  # yapf: disable
+    'max-batch-size', 'max-prefill-token-num', 'server-name',
+    'enable-prefix-caching', 'session-len',
+}


 def throughput_test(config, run_config, worker_id: str = '', is_smoke: bool = False):
@@ -34,6 +37,8 @@ def throughput_test(config, run_config, worker_id: str = '', is_smoke: bool = Fa
         k: v
         for k, v in bench_config.get('extra_params', {}).items() if k not in SERVE_ONLY_PARAMS
     }
+    if 'openai/gpt-oss' in run_config.get('model', ''):
+        bench_config['extra_params']['model-format'] = 'mxfp4'
     command = f'{cuda_prefix} python3 benchmark/profile_throughput.py {dataset_path} {model_path} {get_cli_common_param(bench_config)}'  # noqa

     if is_smoke:
diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py
index 16f79dc070..bbe375f987 100644
--- a/autotest/utils/config_utils.py
+++ b/autotest/utils/config_utils.py
@@ -135,9 +135,6 @@ def get_func_config_list(backend: str,
             run_config['extra_params']['max-prefill-token-num'] = 1024
             run_config['extra_params']['max-batch-size'] = 128

-        if 'openai/gpt-oss' in run_config['model']:
-            run_config['extra_params']['model-format'] = 'mxfp4'
-
     return run_configs

From 11424c7c4a8b77edef75fe89d9998aa5d8edd53b Mon Sep 17 00:00:00 2001
From: littlegy <787321726@qq.com>
Date: Thu, 19 Mar 2026 16:22:44 +0800
Subject: [PATCH 05/10] update

---
 autotest/utils/benchmark_utils.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/autotest/utils/benchmark_utils.py b/autotest/utils/benchmark_utils.py
index 0c8d3a5789..c425ddafa2 100644
--- a/autotest/utils/benchmark_utils.py
+++ b/autotest/utils/benchmark_utils.py
@@ -37,7 +37,7 @@ def throughput_test(config, run_config, worker_id: str = '', is_smoke: bool = Fa
         k: v
         for k, v in bench_config.get('extra_params', {}).items() if k not in SERVE_ONLY_PARAMS
     }
-    if 'openai/gpt-oss' in run_config.get('model', ''):
+    if 'openai/gpt-oss' in run_config.get('model', '') and run_config.get('backend') == 'turbomind':
         bench_config['extra_params']['model-format'] = 'mxfp4'
     command = f'{cuda_prefix} python3 benchmark/profile_throughput.py {dataset_path} {model_path} {get_cli_common_param(bench_config)}'  # noqa

     if is_smoke:
@@ -90,6 +90,8 @@ def longtext_throughput_test(config, run_config, worker_id: str = ''):
         k: v
         for k, v in bench_config.get('extra_params', {}).items() if k not in SERVE_ONLY_PARAMS
     }
+    if 'openai/gpt-oss' in run_config.get('model', '') and run_config.get('backend') == 'turbomind':
+        bench_config['extra_params']['model-format'] = 'mxfp4'
     command = f'{cuda_prefix} python3 benchmark/profile_pipeline_api.py {dataset_path} {model_path} {get_cli_common_param(bench_config)}'  # noqa

     env = os.environ.copy()
@@ -234,6 +236,8 @@ def prefixcache_throughput_test(config, run_config, worker_id: str = '', is_smok
         for k, v in run_config_new.get('extra_params', {}).items() if k not in SERVE_ONLY_PARAMS
     }
     run_config_new['extra_params']['session-len'] = 32768
+    if 'openai/gpt-oss' in run_config.get('model', '') and run_config.get('backend') == 'turbomind':
+        run_config_new['extra_params']['model-format'] = 'mxfp4'
     command = f'{cuda_prefix} python3 benchmark/profile_pipeline_api.py {dataset_path} {model_path} {get_cli_common_param(run_config_new)}'  # noqa

     env = os.environ.copy()

From 8671605c3696688d049421857f4e7f421045e37a Mon Sep 17 00:00:00 2001
From: littlegy <787321726@qq.com>
Date: Thu, 19 Mar 2026 16:38:45 +0800
Subject: [PATCH 06/10] update vl test config

---
 autotest/config_h.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/autotest/config_h.yml b/autotest/config_h.yml
index df2f88f304..b1d6d8cc3e 100644
--- a/autotest/config_h.yml
+++ b/autotest/config_h.yml
@@ -146,6 +146,9 @@ pytorch_vl_model:
     - Qwen/Qwen3.5-35B-A3B
     - Qwen/Qwen3.5-122B-A10B
     - OpenGVLab/InternVL3_5-38B
+    - Qwen/Qwen3.5-27B
+    - Qwen/Qwen3.5-35B-A3B
+    - Qwen/Qwen3.5-122B-A10B


 turbomind_base_model:
@@ -292,3 +295,6 @@ evaluate_model:
 mllm_evaluate_model:
   - OpenGVLab/InternVL3_5-38B
   - Qwen/Qwen3-VL-30B-A3B-Instruct
+  - Qwen/Qwen3.5-27B
+  - Qwen/Qwen3.5-35B-A3B
+  - Qwen/Qwen3.5-122B-A10B

From a2f1446a0b4fa7905cfea909e6d5c2be24a39e73 Mon Sep 17 00:00:00 2001
From: littlegy <787321726@qq.com>
Date: Thu, 19 Mar 2026 17:02:19 +0800
Subject: [PATCH 07/10] update benchmark test

---
 autotest/utils/benchmark_utils.py | 6 ------
 autotest/utils/config_utils.py    | 4 ++++
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/autotest/utils/benchmark_utils.py b/autotest/utils/benchmark_utils.py
index c425ddafa2..fffa79a40b 100644
--- a/autotest/utils/benchmark_utils.py
+++ b/autotest/utils/benchmark_utils.py
@@ -37,8 +37,6 @@ def throughput_test(config, run_config, worker_id: str = '', is_smoke: bool = Fa
         k: v
         for k, v in bench_config.get('extra_params', {}).items() if k not in SERVE_ONLY_PARAMS
     }
-    if 'openai/gpt-oss' in run_config.get('model', '') and run_config.get('backend') == 'turbomind':
-        bench_config['extra_params']['model-format'] = 'mxfp4'
     command = f'{cuda_prefix} python3 benchmark/profile_throughput.py {dataset_path} {model_path} {get_cli_common_param(bench_config)}'  # noqa

     if is_smoke:
@@ -90,8 +88,6 @@ def longtext_throughput_test(config, run_config, worker_id: str = ''):
         k: v
         for k, v in bench_config.get('extra_params', {}).items() if k not in SERVE_ONLY_PARAMS
     }
-    if 'openai/gpt-oss' in run_config.get('model', '') and run_config.get('backend') == 'turbomind':
-        bench_config['extra_params']['model-format'] = 'mxfp4'
     command = f'{cuda_prefix} python3 benchmark/profile_pipeline_api.py {dataset_path} {model_path} {get_cli_common_param(bench_config)}'  # noqa

     env = os.environ.copy()
@@ -236,8 +232,6 @@ def prefixcache_throughput_test(config, run_config, worker_id: str = '', is_smok
         for k, v in run_config_new.get('extra_params', {}).items() if k not in SERVE_ONLY_PARAMS
     }
     run_config_new['extra_params']['session-len'] = 32768
-    if 'openai/gpt-oss' in run_config.get('model', '') and run_config.get('backend') == 'turbomind':
-        run_config_new['extra_params']['model-format'] = 'mxfp4'
     command = f'{cuda_prefix} python3 benchmark/profile_pipeline_api.py {dataset_path} {model_path} {get_cli_common_param(run_config_new)}'  # noqa

     env = os.environ.copy()
diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py
index bbe375f987..b6283ddda7 100644
--- a/autotest/utils/config_utils.py
+++ b/autotest/utils/config_utils.py
@@ -135,6 +135,10 @@ def get_func_config_list(backend: str,
             run_config['extra_params']['max-prefill-token-num'] = 1024
             run_config['extra_params']['max-batch-size'] = 128

+        if ('openai/gpt-oss' in run_config['model'] and backend == 'turbomind'
+                and func_type in ('benchmark', 'longtext_benchmark')):
+            run_config['extra_params']['model-format'] = 'mxfp4'
+
     return run_configs

From c6efe39f10cf5cc682c560cdee7add7ab5c49f0c Mon Sep 17 00:00:00 2001
From: littlegy <787321726@qq.com>
Date: Mon, 13 Apr 2026 16:44:44 +0800
Subject: [PATCH 08/10] update config h and add qwen3.5 mtp

---
 autotest/config_ascend.yml                    |  13 +-
 autotest/config_h.yml                         |  63 +++++-
 autotest/config_h_legacy.yml                  |  36 +---
 autotest/evaluate/eval_config_chat.py         |  11 +-
 .../evaluate/eval_config_chat_512_longtext.py | 117 +++++++++++
 .../evaluate/eval_config_chat_longtext.py     | 184 +++++++++++++++++
 autotest/evaluate/test_api_evaluate.py        | 189 ++++++++++++++++--
 autotest/tools/common_case_config.py          |  90 ++++++++-
 autotest/utils/config_utils.py                |   6 +-
 autotest/utils/constant.py                    |  53 +++--
 autotest/utils/evaluate_utils.py              |  19 +-
 11 files changed, 706 insertions(+), 75 deletions(-)
 create mode 100644 autotest/evaluate/eval_config_chat_512_longtext.py
 create mode 100644 autotest/evaluate/eval_config_chat_longtext.py

diff --git a/autotest/config_ascend.yml b/autotest/config_ascend.yml
index 6087805f03..29efc3826d 100644
--- a/autotest/config_ascend.yml
+++ b/autotest/config_ascend.yml
@@ -23,6 +23,12 @@ config:
     Qwen/Qwen3-VL-30B-A3B-Instruct: 4
     Qwen/Qwen3-VL-8B-Instruct: 2
     Qwen/Qwen3-VL-32B-Instruct: 4
+    internlm/Intern-S1-Pro-BF16: 64
+
+  dp_ep:
+    internlm/Intern-S1-Pro-BF16:
+      dp: 64
+      ep: 64

 pytorch_chat_model:
   tp:
@@ -31,6 +37,10 @@ pytorch_chat_model:
     - Qwen/Qwen3-32B
     - Qwen/Qwen3-8B
     - Qwen/Qwen3-0.6B
+    - internlm/Intern-S1-Pro-BF16
+
+  dp_ep:
+    - internlm/Intern-S1-Pro-BF16

 pytorch_vl_model:
   tp:
@@ -43,7 +53,6 @@ pytorch_vl_model:
     - Qwen/Qwen3-VL-8B-Instruct
     - Qwen/Qwen3-VL-32B-Instruct

-
 pytorch_base_model:
   tp:
     - Qwen/Qwen3-0.6B
@@ -67,6 +76,7 @@ pytorch_quantization:
     - Qwen/Qwen3-VL-30B-A3B-Instruct
     - Qwen/Qwen3-VL-8B-Instruct
     - Qwen/Qwen3-VL-32B-Instruct
+    - internlm/Intern-S1-Pro-BF16
   no_kvint8:
     - Qwen/Qwen3-30B-A3B
     - Qwen/Qwen3-235B-A22B
@@ -81,6 +91,7 @@ pytorch_quantization:
     - Qwen/Qwen3-VL-30B-A3B-Instruct
     - Qwen/Qwen3-VL-8B-Instruct
    - Qwen/Qwen3-VL-32B-Instruct
+    - internlm/Intern-S1-Pro-BF16

 longtext_benchmark_model:
   - Qwen/Qwen3-30B-A3B
diff --git a/autotest/config_h.yml b/autotest/config_h.yml
index b1d6d8cc3e..675f1f6f4b 100644
--- a/autotest/config_h.yml
+++ b/autotest/config_h.yml
@@ -22,7 +22,7 @@ config:
     Qwen/Qwen3-235B-A22B-Thinking-2507: 8
     OpenGVLab/InternVL3_5-38B: 2
     Qwen/Qwen3-VL-30B-A3B-Instruct: 2
-    zai-org/GLM-5: 16
+    zai-org/GLM-5-FP8: 8
     Qwen/Qwen3.5-27B: 2
     Qwen/Qwen3.5-35B-A3B: 2
     Qwen/Qwen3.5-122B-A10B: 4
@@ -35,11 +35,16 @@ config:
     OpenGVLab/InternVL3_5-30B-A3B: 2
     zai-org/GLM-4.7-Flash: 2
     google/gemma-3-27b-it: 2
+    internlm/Intern-S1: 4
+    internlm/Intern-S1-Pro-FP8: 16

   dp_ep:
     Qwen/Qwen3-235B-A22B-Thinking-2507:
       dp: 8
       ep: 8
+    internlm/Intern-S1-Pro-FP8:
+      dp: 16
+      ep: 16

   cp_tp:
     Qwen/Qwen3-235B-A22B-Thinking-2507:
@@ -76,6 +81,8 @@ turbomind_chat_model:
     - THUDM/glm-4-9b-chat
     - openai/gpt-oss-120b
     - openai/gpt-oss-20b
+    - internlm/Intern-S1
+    - internlm/Intern-S1-mini

   cp_tp:
     - Qwen/Qwen3-235B-A22B-Thinking-2507
@@ -110,17 +117,22 @@ pytorch_chat_model:
     - zai-org/GLM-4.7-Flash
     - microsoft/Phi-3.5-vision-instruct
     - microsoft/Phi-3-vision-128k-instruct
-    - zai-org/GLM-5
+    - zai-org/GLM-5-FP8
     - Qwen/Qwen3.5-27B
     - Qwen/Qwen3.5-35B-A3B
+    - Qwen/Qwen3.5-35B-A3B-FP8
     - Qwen/Qwen3.5-122B-A10B
     - deepseek-ai/DeepSeek-V3.1
     - unsloth/gpt-oss-20b-BF16
     - google/gemma-3-27b-it
     - OpenGVLab/InternVL3_5-38B
+    - internlm/Intern-S1
+    - internlm/Intern-S1-mini
+    - internlm/Intern-S1-Pro-FP8

   dp_ep:
     - Qwen/Qwen3-235B-A22B-Thinking-2507
+    - internlm/Intern-S1-Pro-FP8

 turbomind_vl_model:
   tp:
@@ -130,6 +142,8 @@ turbomind_vl_model:
     - Qwen/Qwen2.5-VL-7B-Instruct
     - Qwen/Qwen2.5-VL-32B-Instruct
     - OpenGVLab/InternVL3_5-38B
+    - internlm/Intern-S1
+    - internlm/Intern-S1-mini

 pytorch_vl_model:
   tp:
@@ -144,11 +158,10 @@ pytorch_vl_model:
     - microsoft/Phi-3.5-vision-instruct
     - Qwen/Qwen3.5-27B
     - Qwen/Qwen3.5-35B-A3B
+    - Qwen/Qwen3.5-35B-A3B-FP8
     - Qwen/Qwen3.5-122B-A10B
-    - OpenGVLab/InternVL3_5-38B
-    - Qwen/Qwen3.5-27B
-    - Qwen/Qwen3.5-35B-A3B
-    - Qwen/Qwen3.5-122B-A10B
+    - internlm/Intern-S1
+    - internlm/Intern-S1-mini


 turbomind_base_model:
@@ -170,6 +183,7 @@ turbomind_quantization:
     - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
     - Qwen/Qwen3-32B-FP8
     - Qwen/Qwen3-30B-A3B-FP8
+    - Qwen/Qwen3.5-35B-A3B-FP8
     - Qwen/Qwen3-30B-A3B-Base
     - Qwen/Qwen1.5-MoE-A2.7B-Chat
     - Qwen/Qwen2.5-VL-7B-Instruct
@@ -181,6 +195,8 @@ turbomind_quantization:
     - microsoft/Phi-3-mini-4k-instruct
     - THUDM/glm-4v-9b
     - THUDM/glm-4-9b-chat
+    - internlm/Intern-S1
+    - internlm/Intern-S1-mini

   gptq:
     - empty
@@ -194,6 +210,7 @@ turbomind_quantization:
     - Qwen/Qwen3-32B-FP8
     - Qwen/Qwen3-30B-A3B
     - Qwen/Qwen3-30B-A3B-FP8
+    - Qwen/Qwen3.5-35B-A3B-FP8
     - Qwen/Qwen3-235B-A22B-Thinking-2507
     - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
     - Qwen/Qwen2.5-VL-7B-Instruct
@@ -204,6 +221,8 @@ turbomind_quantization:
     - deepseek-ai/DeepSeek-V2-Lite-Chat
     - openai/gpt-oss-120b
     - openai/gpt-oss-20b
+    - internlm/Intern-S1
+    - internlm/Intern-S1-mini
   no_kvint8:
     - deepseek-ai/DeepSeek-V2-Chat
     - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
@@ -235,6 +254,7 @@ pytorch_quantization:
     - Qwen/Qwen3-32B-FP8
     - Qwen/Qwen3-30B-A3B
     - Qwen/Qwen3-30B-A3B-FP8
+    - Qwen/Qwen3.5-35B-A3B-FP8
     - Qwen/Qwen3-235B-A22B-Thinking-2507
     - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
     - Qwen/Qwen3-VL-8B-Instruct
@@ -242,26 +262,32 @@ pytorch_quantization:
     - microsoft/Phi-3-vision-128k-instruct
     - microsoft/Phi-3.5-vision-instruct
     - zai-org/GLM-4.7-Flash
-    - zai-org/GLM-5
+    - zai-org/GLM-5-FP8
     - Qwen/Qwen3.5-27B
     - Qwen/Qwen3.5-35B-A3B
     - Qwen/Qwen3.5-122B-A10B
     - deepseek-ai/DeepSeek-V3.1
+    - internlm/Intern-S1
+    - internlm/Intern-S1-mini
+    - internlm/Intern-S1-Pro-FP8
   no_kvint8:
     - zai-org/GLM-4.7-Flash
-    - zai-org/GLM-5
+    - zai-org/GLM-5-FP8
     - deepseek-ai/DeepSeek-V3.1
     - Qwen/Qwen3.5-27B
     - Qwen/Qwen3.5-35B-A3B
+    - Qwen/Qwen3.5-35B-A3B-FP8
     - Qwen/Qwen3.5-122B-A10B
     - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
+    - internlm/Intern-S1-Pro-FP8

 longtext_benchmark_model:
   - Qwen/Qwen3-30B-A3B
   - Qwen/Qwen3-235B-A22B-Thinking-2507
-  - zai-org/GLM-5
+  - zai-org/GLM-5-FP8
   - Qwen/Qwen3.5-27B
   - Qwen/Qwen3.5-35B-A3B
+  - Qwen/Qwen3.5-35B-A3B-FP8
   - Qwen/Qwen3.5-122B-A10B

 benchmark_model:
@@ -273,11 +299,14 @@ benchmark_model:
   - openai/gpt-oss-120b
   - openai/gpt-oss-20b
   - unsloth/gpt-oss-20b-BF16
-  - zai-org/GLM-5
+  - zai-org/GLM-5-FP8
   - Qwen/Qwen3.5-27B
   - Qwen/Qwen3.5-35B-A3B
+  - Qwen/Qwen3.5-35B-A3B-FP8
   - Qwen/Qwen3.5-122B-A10B
   - google/gemma-3-27b-it
+  - internlm/Intern-S1
+  - internlm/Intern-S1-mini

 evaluate_model:
   - Qwen/Qwen3-32B-FP8
@@ -287,14 +316,26 @@ evaluate_model:
   - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
   - openai/gpt-oss-120b
   - deepseek-ai/DeepSeek-V3.1
-  - zai-org/GLM-5
+  - zai-org/GLM-5-FP8
   - Qwen/Qwen3.5-27B
   - Qwen/Qwen3.5-35B-A3B
+  - Qwen/Qwen3.5-35B-A3B-FP8
   - Qwen/Qwen3.5-122B-A10B
+  - internlm/Intern-S1-Pro-FP8
+
+longtext_evaluate_model:
+  - Qwen/Qwen3.5-35B-A3B
+
+mtp_evaluate_model:
+  - Qwen/Qwen3.5-35B-A3B
+  - Qwen/Qwen3.5-35B-A3B-FP8

 mllm_evaluate_model:
   - OpenGVLab/InternVL3_5-38B
   - Qwen/Qwen3-VL-30B-A3B-Instruct
   - Qwen/Qwen3.5-27B
   - Qwen/Qwen3.5-35B-A3B
+  - Qwen/Qwen3.5-35B-A3B-FP8
   - Qwen/Qwen3.5-122B-A10B
+  - internlm/Intern-S1
+  - internlm/Intern-S1-mini
diff --git a/autotest/config_h_legacy.yml b/autotest/config_h_legacy.yml
index a9d922aad6..c9f80a3f49 100644
--- a/autotest/config_h_legacy.yml
+++ b/autotest/config_h_legacy.yml
@@ -12,28 +12,20 @@ device: cuda
 config:
   tp:
-    internlm/Intern-S1: 4
-    internlm/Intern-S1-Pro-FP8: 16
     JetLM/SDAR-30B-A3B-Sci: 2
     moonshotai/Kimi-K2-Instruct-0905: 16

   dp_ep:
-    internlm/Intern-S1-Pro-FP8:
-      dp: 16
-      ep: 16
     moonshotai/Kimi-K2-Instruct-0905:
       dp: 16
       ep: 16

 turbomind_chat_model:
   tp:
-    - internlm/Intern-S1
-    - internlm/Intern-S1-mini
+    - empty

 pytorch_chat_model:
   tp:
-    - internlm/Intern-S1
-    - internlm/Intern-S1-mini
     - JetLM/SDAR-30B-A3B-Sci
     - moonshotai/Kimi-K2-Instruct-0905

@@ -42,29 +34,27 @@ pytorch_chat_model:

 turbomind_vl_model:
   tp:
-    - internlm/Intern-S1
-    - internlm/Intern-S1-mini
+    - empty

 pytorch_vl_model:
   tp:
-    - internlm/Intern-S1
-    - internlm/Intern-S1-mini
+    - empty

 turbomind_base_model:
   tp:
+    - empty

 pytorch_base_model:
   tp:
+    - empty

 turbomind_quantization:
   no_awq:
-    - internlm/Intern-S1
-    - internlm/Intern-S1-mini
+    - empty
   gptq:
     - empty
   no_kvint4:
-    - internlm/Intern-S1
-    - internlm/Intern-S1-mini
+    - empty
   no_kvint8:
     - empty

@@ -74,21 +64,17 @@ pytorch_quantization:
   w8a8:
     - empty
   no_kvint4:
-    - internlm/Intern-S1
-    - internlm/Intern-S1-mini
     - JetLM/SDAR-30B-A3B-Sci
+    - moonshotai/Kimi-K2-Instruct-0905
   no_kvint8:
-    - empty
+    - moonshotai/Kimi-K2-Instruct-0905

 benchmark_model:
-  - internlm/Intern-S1
-  - internlm/Intern-S1-mini
+  - empty

 evaluate_model:
-  - internlm/Intern-S1-Pro-FP8
   - JetLM/SDAR-30B-A3B-Sci
   - moonshotai/Kimi-K2-Instruct-0905

 mllm_evaluate_model:
-  - internlm/Intern-S1
-  - internlm/Intern-S1-mini
+  - empty
diff --git a/autotest/evaluate/eval_config_chat.py b/autotest/evaluate/eval_config_chat.py
index 0b29046dcf..76dbfc6618 100644
--- a/autotest/evaluate/eval_config_chat.py
+++ b/autotest/evaluate/eval_config_chat.py
@@ -4,7 +4,7 @@
 from opencompass.models import OpenAISDK
 from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
 from opencompass.runners import LocalRunner
-from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
+from opencompass.tasks import OpenICLEvalTask, OpenICLInferConcurrentTask
 from opencompass.utils.text_postprocessors import extract_non_reasoning_content

 #######################################################################
@@ -48,6 +48,7 @@
          run_cfg=dict(num_gpus=0),
          meta_template=api_meta_template,
          timeout=10800,
+         max_workers=1024,
          pred_postprocessor=dict(type=extract_non_reasoning_content))
 ]

@@ -128,15 +129,13 @@
     if 'max_out_len' in item['infer_cfg']['inferencer']:
         del item['infer_cfg']['inferencer']['max_out_len']

-NUM_WORKERS = 8
-
 infer = dict(
-    partitioner=dict(type=NumWorkerPartitioner, num_worker=NUM_WORKERS),
+    partitioner=dict(type=NumWorkerPartitioner, num_worker=1),
     runner=dict(
         type=LocalRunner,
         max_num_workers=64,
         retry=0,
-        task=dict(type=OpenICLInferTask),
+        task=dict(type=OpenICLInferConcurrentTask),
     ),
 )

@@ -145,5 +144,3 @@
     partitioner=dict(type=NaivePartitioner, n=10),
     runner=dict(type=LocalRunner, max_num_workers=64, task=dict(type=OpenICLEvalTask)),
 )
-
-infer['partitioner']['num_worker'] = 64
diff --git a/autotest/evaluate/eval_config_chat_512_longtext.py b/autotest/evaluate/eval_config_chat_512_longtext.py
new file mode 100644
index 0000000000..139fd1a655
--- /dev/null
+++ b/autotest/evaluate/eval_config_chat_512_longtext.py
@@ -0,0 +1,117 @@
+# flake8: noqa
+
+from mmengine.config import read_base
+from opencompass.models import OpenAISDK
+from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
+from opencompass.runners import LocalRunner
+from opencompass.tasks import OpenICLEvalTask, OpenICLInferConcurrentTask
+from opencompass.utils.text_postprocessors import extract_non_reasoning_content
+
+#######################################################################
+#                          PART 0  Essential Configs                  #
+#######################################################################
+with read_base():
+    from opencompass.configs.datasets.ruler.ruler_512k_gen import (
+        ruler_datasets as ruler_512k_datasets,
+    )
+    from opencompass.configs.summarizers.groups.ruler import (
+        ruler_summary_groups as _ruler_summary_groups_all,
+    )
+
+ruler_summary_groups = [
+    g for g in _ruler_summary_groups_all if g.get('name') == 'ruler_512k'
+]
+
+#######################################################################
+#                       Model Configuration                           #
+#######################################################################
+
+MODEL_NAME = ''
+MODEL_PATH = ''
+API_BASE = ''
+JUDGE_MODEL_NAME = ''
+JUDGE_MODEL_PATH = ''
+JUDGE_API_BASE = ''
+
+api_meta_template = dict(round=[
+    dict(role='HUMAN', api_role='HUMAN'),
+    dict(role='BOT', api_role='BOT', generate=True),
+])
+
+# Use OpenAISDK to configure LMDeploy OpenAI interface
+models = [
+    dict(type=OpenAISDK,
+         abbr=f'{MODEL_NAME}',
+         path=MODEL_PATH,
+         key='EMPTY',
+         openai_api_base=API_BASE,
+         retry=3,
+         run_cfg=dict(num_gpus=0),
+         meta_template=api_meta_template,
+         timeout=10800,
+         max_workers=1024,
+         pred_postprocessor=dict(type=extract_non_reasoning_content))
+]
+
+#######################################################################
+#                          PART 1  Datasets List                      #
+#######################################################################
+datasets = list(ruler_512k_datasets)
+
+judge_cfg = dict(
+    type=OpenAISDK,
+    abbr=f'{JUDGE_MODEL_NAME}',
+    path=JUDGE_MODEL_NAME,
+    key='EMPTY',
+    openai_api_base=JUDGE_API_BASE,
+    meta_template=dict(round=[
+        dict(role='HUMAN', api_role='HUMAN'),
+        dict(role='BOT', api_role='BOT', generate=True),
+    ]),
+    query_per_second=16,
+    batch_size=1024,
+    temperature=0.001,
+    tokenizer_path=JUDGE_MODEL_PATH,
+    verbose=True,
+    max_out_len=8192,
+    max_seq_len=32768,
+    mode='mid',
+)
+
+for item in datasets:
+    if 'judge_cfg' in item['eval_cfg']['evaluator']:
+        item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
+    if 'llm_evaluator' in item['eval_cfg']['evaluator'].keys(
+    ) and 'judge_cfg' in item['eval_cfg']['evaluator']['llm_evaluator']:
+        item['eval_cfg']['evaluator']['llm_evaluator']['judge_cfg'] = judge_cfg
+
+#######################################################################
+#                          PART 2  Dataset Summarizer                 #
+#######################################################################
+
+summarizer = dict(
+    dataset_abbrs=[
+        ['ruler_512k', 'naive_average'],
+    ],
+    summary_groups=ruler_summary_groups,
+)
+
+for item in datasets:
+    if 'max_out_len' in item['infer_cfg']['inferencer']:
+        del item['infer_cfg']['inferencer']['max_out_len']
+
+infer = dict(
+    partitioner=dict(type=NumWorkerPartitioner, num_worker=1),
+    runner=dict(
+        type=LocalRunner,
+        max_num_workers=64,
+        retry=0,
+        task=dict(type=OpenICLInferConcurrentTask),
+    ),
+)
+
+# eval with local runner
+eval = dict(
+    partitioner=dict(type=NaivePartitioner, n=10),
+    runner=dict(type=LocalRunner, max_num_workers=64, task=dict(type=OpenICLEvalTask)),
+)
diff --git a/autotest/evaluate/eval_config_chat_longtext.py b/autotest/evaluate/eval_config_chat_longtext.py
new file mode 100644
index 0000000000..f9ba80e263
--- /dev/null
+++ b/autotest/evaluate/eval_config_chat_longtext.py
@@ -0,0 +1,184 @@
+# flake8: noqa
+
+from mmengine.config import read_base
+from opencompass.models import OpenAISDK
+from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
+from opencompass.runners import LocalRunner
+from opencompass.tasks import OpenICLEvalTask, OpenICLInferConcurrentTask
+from opencompass.utils.text_postprocessors import extract_non_reasoning_content
+
+#######################################################################
+#                          PART 0  Essential Configs                  #
+#######################################################################
+with read_base():
+    # Datasets
+    from opencompass.configs.datasets.needlebench.needlebench_8k.needlebench_8k import (
+        needlebench_datasets as needlebench_8k_datasets,
+    )
+    from opencompass.configs.datasets.needlebench.needlebench_32k.needlebench_32k import (
+        needlebench_datasets as needlebench_32k_datasets,
+    )
+    from opencompass.configs.datasets.needlebench.needlebench_128k.needlebench_128k import (
+        needlebench_datasets as needlebench_128k_datasets,
+    )
+    from opencompass.configs.datasets.ruler.ruler_8k_gen import (
+        ruler_datasets as ruler_8k_datasets,
+    )
+    from opencompass.configs.datasets.ruler.ruler_32k_gen import (
+        ruler_datasets as ruler_32k_datasets,
+    )
+    from opencompass.configs.datasets.ruler.ruler_64k_gen import (
+        ruler_datasets as ruler_64k_datasets,
+    )
+    from opencompass.configs.datasets.ruler.ruler_128k_gen import (
+        ruler_datasets as ruler_128k_datasets,
+    )
+    from opencompass.configs.datasets.ruler.ruler_256k_gen import (
+        ruler_datasets as ruler_256k_datasets,
+    )
+    from opencompass.configs.datasets.babilong.babilong_0k_gen import (
+        babiLong_0k_datasets,
+    )
+    from opencompass.configs.datasets.babilong.babilong_4k_gen import (
+        babiLong_4k_datasets,
+    )
+    from opencompass.configs.datasets.babilong.babilong_16k_gen import (
+        babiLong_16k_datasets,
+    )
+    from opencompass.configs.datasets.babilong.babilong_32k_gen import (
+        babiLong_32k_datasets,
+    )
+    from opencompass.configs.datasets.babilong.babilong_128k_gen import (
+        babiLong_128k_datasets,
+    )
+    from opencompass.configs.datasets.babilong.babilong_256k_gen import (
+        babiLong_256k_datasets,
+    )
+    # Summary Groups
+    from opencompass.configs.summarizers.groups.babilong import (
+        babilong_summary_groups,
+    )
+    from opencompass.configs.summarizers.groups.ruler import (
+        ruler_summary_groups,
+    )
+    from opencompass.configs.summarizers.needlebench import (
+        needlebench_8k_summarizer,
+        needlebench_32k_summarizer,
+        needlebench_128k_summarizer,
+    )
+
+ruler_summary_groups = [
+    g for g in ruler_summary_groups if g.get('name') != 'ruler_512k'
+]
+
+#######################################################################
+#                       Model Configuration                           #
+#######################################################################
+
+MODEL_NAME = ''
+MODEL_PATH = ''
+API_BASE = ''
+JUDGE_MODEL_NAME = ''
+JUDGE_MODEL_PATH = ''
+JUDGE_API_BASE = ''
+
+api_meta_template = dict(round=[
+    dict(role='HUMAN', api_role='HUMAN'),
+    dict(role='BOT', api_role='BOT', generate=True),
+])
+
+# Use OpenAISDK to configure LMDeploy OpenAI interface
+models = [
+    dict(type=OpenAISDK,
+         abbr=f'{MODEL_NAME}',
+         path=MODEL_PATH,
+         key='EMPTY',
+         openai_api_base=API_BASE,
+         retry=3,
+         run_cfg=dict(num_gpus=0),
+         meta_template=api_meta_template,
+         timeout=10800,
+         max_workers=1024,
+         pred_postprocessor=dict(type=extract_non_reasoning_content))
+]
+
+#######################################################################
+#                          PART 1  Datasets List                      #
+#######################################################################
+datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
+
+needlebench_8k_summary_groups = needlebench_8k_summarizer["summary_groups"]
+needlebench_32k_summary_groups = needlebench_32k_summarizer["summary_groups"]
+needlebench_128k_summary_groups = needlebench_128k_summarizer["summary_groups"]
+
+# LLM judge config: using LLM to evaluate predictions
+judge_cfg = dict(
+    type=OpenAISDK,
+    abbr=f'{JUDGE_MODEL_NAME}',
+    path=JUDGE_MODEL_NAME,
+    key='EMPTY',
+    openai_api_base=JUDGE_API_BASE,
+    meta_template=dict(round=[
+        dict(role='HUMAN', api_role='HUMAN'),
+        dict(role='BOT', api_role='BOT', generate=True),
+    ]),
+    query_per_second=16,
+    batch_size=1024,
+    temperature=0.001,
+    tokenizer_path=JUDGE_MODEL_PATH,
+    verbose=True,
+    max_out_len=8192,
+    max_seq_len=32768,
+    mode='mid',
+)
+
+for item in datasets:
+    if 'judge_cfg' in item['eval_cfg']['evaluator']:
+        item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
+    if 'llm_evaluator' in item['eval_cfg']['evaluator'].keys(
+    ) and 'judge_cfg' in item['eval_cfg']['evaluator']['llm_evaluator']:
+        item['eval_cfg']['evaluator']['llm_evaluator']['judge_cfg'] = judge_cfg
+
+#######################################################################
+#                          PART 2  Dataset Summarizer                 #
+#######################################################################
+
+summarizer = dict(
+    dataset_abbrs=[
+        ["ruler_8k", "naive_average"],
+        ["ruler_32k", "naive_average"],
+        ["ruler_64k", "naive_average"],
+        ["ruler_128k", "naive_average"],
+        ["ruler_256k", "naive_average"],
+        ["NeedleBench-Overall-Score-8K", "weighted_average"],
+        ["NeedleBench-Overall-Score-32K", "weighted_average"],
+        ["NeedleBench-Overall-Score-128K", "weighted_average"],
+        ['babilong_0k', 'naive_average'],
+        ['babilong_4k', 'naive_average'],
+        ['babilong_16k', 'naive_average'],
+        ['babilong_32k', 'naive_average'],
+        ['babilong_128k', 'naive_average'],
+        ['babilong_256k', 'naive_average'],
+    ],
+    summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
+)
+
+for item in datasets:
+    if 'max_out_len' in item['infer_cfg']['inferencer']:
+        del item['infer_cfg']['inferencer']['max_out_len']
+
+infer = dict(
+    partitioner=dict(type=NumWorkerPartitioner, num_worker=1),
+    runner=dict(
+        type=LocalRunner,
+        max_num_workers=64,
+        retry=0,
+        task=dict(type=OpenICLInferConcurrentTask),
+    ),
+)
+
+# eval with local runner
+eval = dict(
+    partitioner=dict(type=NaivePartitioner, n=10),
+    runner=dict(type=LocalRunner, max_num_workers=64, task=dict(type=OpenICLEvalTask)),
+)
diff --git a/autotest/evaluate/test_api_evaluate.py b/autotest/evaluate/test_api_evaluate.py
index 785b79f668..7877b29c04 100644
--- a/autotest/evaluate/test_api_evaluate.py
+++ b/autotest/evaluate/test_api_evaluate.py
@@ -1,3 +1,4 @@
+import copy
 import os
 import time

@@ -112,26 +113,41 @@ def _run_proxy_distributed_test(config,
     time.sleep(1)


-def run_eval_test(config, run_config, worker_id, test_type='infer', eval_config_name='default'):
+def run_eval_test(config, run_config, worker_id, test_type='infer', eval_config_name='default', eval_subpath=None):
     """Run test with specified evaluation configuration."""
-    if 'gpt' in run_config.get('model', '').lower():
-        eval_config_name = 'gpt'
-    elif 'sdar' in run_config.get('model', '').lower():
-        eval_config_name = 'sdar'
-    elif 'intern-s1-pro' in run_config.get('model', '').lower():
-        eval_config_name = 'intern-s1-pro'
-    elif 'qwen3.5' in run_config.get('model', '').lower():
-        eval_config_name = 'qwen3.5'
-    if str(config.get('env_tag')) == 'a100':
-        eval_config_name = f'{eval_config_name}-32k'
-    elif str(config.get('env_tag')) == 'ascend':
-        eval_config_name = f'{eval_config_name}-2batch'
+    if eval_config_name == 'default':
+        longtext_key = run_config.get('_longtext_eval_config_name')
+        if longtext_key:
+            eval_config_name = longtext_key
+        else:
+            if 'gpt' in run_config.get('model', '').lower():
+                eval_config_name = 'gpt'
+            elif 'sdar' in run_config.get('model', '').lower():
+                eval_config_name = 'sdar'
+            elif 'intern-s1-pro' in run_config.get('model', '').lower():
+                eval_config_name = 'intern-s1-pro'
+            elif 'qwen3.5' in run_config.get('model', '').lower():
+                eval_config_name = 'qwen3.5'
+            if str(config.get('env_tag')) == 'a100':
+                eval_config_name = f'{eval_config_name}-32k'
+            elif str(config.get('env_tag')) == 'ascend':
+                eval_config_name = f'{eval_config_name}-2batch'

     preset_config = constant.EVAL_CONFIGS.get(eval_config_name, {})
     eval_path = config.get('eval_path')
+    if eval_subpath:
+        rel = eval_subpath
+        nested = run_config.get('_eval_path_subdir')
+        if nested:
+            rel = os.path.join(rel, nested)
+        eval_path = os.path.join(eval_path, rel)
+        os.makedirs(eval_path, exist_ok=True)
+
     total_gpus = int(os.environ.get('TOTAL_GPU_COUNT', '8'))
     work_num = int(total_gpus / run_config.get('parallel_config', {}).get('tp', 1))
+
+    # Scale max-num-workers with the number of service workers, capped at 64
     extra_config = {'max-num-workers': min(work_num * 16, 64)}
+
     case_name = get_case_str_by_config(run_config)

     if test_type == 'infer':
@@ -163,6 +179,7 @@ def run_openai_service_start(i):
                              port=constant.PROXY_PORT,
                              test_type=test_type,
                              extra_config=extra_config,
+                             eval_config_name=eval_config_name,
                              **preset_config)
         finally:
             for i in range(work_num):
@@ -186,6 +203,7 @@ def run_openai_service_start(i):
                      port=port,
                      test_type=test_type,
                      extra_config=extra_config,
+                     eval_config_name=eval_config_name,
                      **preset_config)
     else:
        assert False, f'Failed to start RESTful API server: {content}'

        stop_restful_api(proxy_pid, proxy_process)


-def get_models(backend, parallel_config):
-    return get_func_config_list(backend, parallel_config, func_type='evaluate', extra={'session_len': 65536})
+def get_models(backend, parallel_config, session_len='auto'):
+    if session_len == 'auto':
+        configs = get_func_config_list(backend, parallel_config, func_type='evaluate', extra={})
+        result = []
+        for config in configs:
+            model = config.get('model', '')
+            if 'Qwen3.5' not in model:
+                if 'extra_params' not in config:
+                    config['extra_params'] = {}
+                config['extra_params']['session_len'] = 65536
+            result.append(config)
+        return result
+    else:
+        extra = {'session_len': session_len} if session_len is not None else {}
+        return get_func_config_list(backend, parallel_config, func_type='evaluate', extra=extra)
+
+
+def _resolve_longtext_eval_config_name(run_config: dict) -> str | None:
+    """Map longtext_evaluate config to EVAL_CONFIGS key; add branches when new
+    longtext families ship."""
+    ep = run_config.get('extra_params') or {}
+    raw = ep.get('session_len', ep.get('session-len'))
+    if raw is None:
+        return None
+    try:
+        sl = int(raw)
+    except (TypeError, ValueError):
+        return None
+    model_lower = (run_config.get('model') or '').lower()
+    if 'qwen3.5' in model_lower:
+        if sl >= 600000:
+            return 'longtext-512k'
+        if sl >= 300000:
+            return 'longtext-256k'
+    return None
+
+
+def get_longtext_models(backend, parallel_config, session_len='auto'):
+    if session_len == 'auto':
+        session_len = 65536
+    extra = {'session_len': session_len} if session_len is not None else {}
+    configs = get_func_config_list(backend, parallel_config, func_type='longtext_evaluate', extra=extra)
+    for cfg in configs:
+        preset_key = _resolve_longtext_eval_config_name(cfg)
+        if preset_key:
+            cfg['_longtext_eval_config_name'] = preset_key
+    return configs
+
+
+def get_mtp_models(backend, parallel_config):
+    base_configs = get_func_config_list(backend, parallel_config, func_type='mtp_evaluate', extra={})
+    for cfg in base_configs:
+        if 'qwen3.5' in cfg.get('model', '').lower():
+            cfg['extra_params'].update(constant.QWEN35_MTP_SERVER_EXTRA)
+
+    result_configs = []
+    for config in base_configs:
+        result_configs.append(config)
+
+        if config.get('model') == 'Qwen/Qwen3.5-35B-A3B' and parallel_config.get('tp') == 2:
+            fp8_config = copy.deepcopy(config)
+            fp8_config['extra_params']['max-prefill-token-num'] = 1024
+            fp8_config['extra_params']['model-format'] = 'fp8'
+            fp8_config['_eval_path_subdir'] = 'serve_fp8'
+            result_configs.append(fp8_config)
+
+    return result_configs


 @pytest.mark.infer
 @pytest.mark.pytorch
 @pytest.mark.gpu_num_2
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.parametrize('run_config', get_models('pytorch', {'tp': 2}))
run_config, worker_id): run_eval_test(config, run_config, worker_id, 'infer') +@pytest.mark.infer +@pytest.mark.pytorch +@pytest.mark.gpu_num_2 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('run_config', get_longtext_models('pytorch', {'tp': 2}, session_len=400000)) +def test_pytorch_restful_tp2_longtext(config, run_config, worker_id): + run_eval_test(config, run_config, worker_id, 'infer') + + +@pytest.mark.infer +@pytest.mark.pytorch +@pytest.mark.gpu_num_2 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('run_config', get_longtext_models('pytorch', {'tp': 2}, session_len=700000)) +def test_pytorch_restful_tp2_longtext_512k(config, run_config, worker_id): + run_eval_test(config, run_config, worker_id, 'infer') + + +@pytest.mark.infer +@pytest.mark.pytorch +@pytest.mark.gpu_num_2 +@pytest.mark.mtp +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('run_config', get_mtp_models('pytorch', {'tp': 2})) +def test_pytorch_restful_tp2_mtp(config, run_config, worker_id): + run_eval_test(config, run_config, worker_id, 'infer', eval_subpath='mtp') + + +@pytest.mark.infer +@pytest.mark.pytorch +@pytest.mark.gpu_num_1 +@pytest.mark.mtp +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('run_config', get_mtp_models('pytorch', {'tp': 1})) +def test_pytorch_restful_tp1_mtp(config, run_config, worker_id): + run_eval_test(config, run_config, worker_id, 'infer', eval_subpath='mtp') + + +@pytest.mark.eval +@pytest.mark.pytorch +@pytest.mark.gpu_num_2 +@pytest.mark.mtp +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('run_config', get_mtp_models('pytorch', {'tp': 2})) +def test_pytorch_eval_tp2_mtp(config, run_config, worker_id): + run_eval_test(config, run_config, worker_id, 'eval', eval_subpath='mtp') + + +@pytest.mark.eval +@pytest.mark.pytorch +@pytest.mark.gpu_num_1 +@pytest.mark.mtp +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('run_config', get_mtp_models('pytorch', {'tp': 1})) +def test_pytorch_eval_tp1_mtp(config, run_config, worker_id): + run_eval_test(config, run_config, worker_id, 'eval', eval_subpath='mtp') + + @pytest.mark.infer @pytest.mark.pytorch @pytest.mark.gpu_num_4 @@ -446,6 +587,24 @@ def test_pytorch_eval_distributed_dpep16(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval') +@pytest.mark.eval +@pytest.mark.pytorch +@pytest.mark.gpu_num_2 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('run_config', get_longtext_models('pytorch', {'tp': 2}, session_len=400000)) +def test_pytorch_eval_tp2_longtext(config, run_config, worker_id): + run_eval_test(config, run_config, worker_id, 'eval') + + +@pytest.mark.eval +@pytest.mark.pytorch +@pytest.mark.gpu_num_2 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('run_config', get_longtext_models('pytorch', {'tp': 2}, session_len=700000)) +def test_pytorch_eval_tp2_longtext_512k(config, run_config, worker_id): + run_eval_test(config, run_config, worker_id, 'eval') + + @pytest.mark.eval @pytest.mark.turbomind @pytest.mark.gpu_num_distributed_cp2tp8 diff --git a/autotest/tools/common_case_config.py b/autotest/tools/common_case_config.py index 3eac291a8f..f32bbc82a0 100644 --- a/autotest/tools/common_case_config.py +++ b/autotest/tools/common_case_config.py @@ -363,6 +363,53 @@ 'num_speculative_tokens': 3 } } +}, { + 'model': 'Qwen/Qwen3.5-35B-A3B', + 'communicator': 'nccl', + 'quant_policy': 0, + 'parallel_config': { + 'tp': 2 + }, + 'extra_params': { + 'max_batch_size': 256, + 'reasoning_parser': 'qwen-qwq', + 'speculative_config': { + 'method': 'qwen3_5_mtp', + 
'num_speculative_tokens': 4 + } + } +}, { + 'model': 'Qwen/Qwen3.5-35B-A3B', + 'communicator': 'nccl', + 'quant_policy': 0, + 'parallel_config': { + 'tp': 2 + }, + 'extra_params': { + 'max_batch_size': 256, + 'reasoning_parser': 'qwen-qwq', + 'max_prefill_token_num': 1024, + 'model_format': 'fp8', + 'speculative_config': { + 'method': 'qwen3_5_mtp', + 'num_speculative_tokens': 4 + } + } +}, { + 'model': 'Qwen/Qwen3.5-35B-A3B-FP8', + 'communicator': 'nccl', + 'quant_policy': 0, + 'parallel_config': { + 'tp': 1 + }, + 'extra_params': { + 'max_batch_size': 256, + 'reasoning_parser': 'qwen-qwq', + 'speculative_config': { + 'method': 'qwen3_5_mtp', + 'num_speculative_tokens': 4 + } + } }] SPECULATIVE_DECODING_PIPELINE_TEST_LLM = [{ @@ -383,7 +430,7 @@ 'max-batch-size': 128 } }, { - 'model': 'deepseek/DeepSeek-V3', + 'model': 'deepseek-ai/DeepSeek-V3', 'communicator': 'nccl', 'quant_policy': 0, 'parallel_config': { @@ -406,6 +453,47 @@ 'speculative-num-draft-tokens': 3, 'max-batch-size': 128 } +}, { + 'model': 'Qwen/Qwen3.5-35B-A3B', + 'communicator': 'nccl', + 'quant_policy': 0, + 'parallel_config': { + 'tp': 2 + }, + 'extra_params': { + 'reasoning-parser': 'qwen-qwq', + 'speculative-algorithm': 'qwen3_5_mtp', + 'speculative-num-draft-tokens': 4, + 'max-batch-size': 256 + } +}, { + 'model': 'Qwen/Qwen3.5-35B-A3B', + 'communicator': 'nccl', + 'quant_policy': 0, + 'parallel_config': { + 'tp': 2 + }, + 'extra_params': { + 'reasoning-parser': 'qwen-qwq', + 'speculative-algorithm': 'qwen3_5_mtp', + 'speculative-num-draft-tokens': 4, + 'max-batch-size': 256, + 'max-prefill-token-num': 1024, + 'model-format': 'fp8' + } +}, { + 'model': 'Qwen/Qwen3.5-35B-A3B-FP8', + 'communicator': 'nccl', + 'quant_policy': 0, + 'parallel_config': { + 'tp': 1 + }, + 'extra_params': { + 'reasoning-parser': 'qwen-qwq', + 'speculative-algorithm': 'qwen3_5_mtp', + 'speculative-num-draft-tokens': 4, + 'max-batch-size': 256 + } }] SPECULATIVE_DECODING_RESTFUL_TEST_LLM = [{ diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index e1ea067cc3..82883d189d 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -101,10 +101,14 @@ def get_func_config_list(backend: str, if 'Qwen3-235B-A22B-Thinking-2507' in run_config['model']: run_config['extra_params']['cache-max-entry-count'] = 0.9 run_config['extra_params']['max-batch-size'] = 1024 + para_conf = run_config.get('parallel_config', {}) + if para_conf.get('dp', 0) == 8 and para_conf.get('ep', 0) == 8: + run_config['extra_params']['max-batch-size'] = 256 - if 'GLM-5' in run_config['model']: + if 'GLM-5-FP8' in run_config['model']: run_config['extra_params']['cache-max-entry-count'] = 0.9 run_config['extra_params']['max-batch-size'] = 128 + run_config['extra_params']['model-format'] = 'fp8' if 'Qwen3.5' in run_config['model']: run_config['extra_params']['session-len'] = 128000 diff --git a/autotest/utils/constant.py b/autotest/utils/constant.py index 28d5933963..1ac6d47d1d 100644 --- a/autotest/utils/constant.py +++ b/autotest/utils/constant.py @@ -141,16 +141,40 @@ 'query_per_second': 4, 'max_out_len': 128000, 'max_seq_len': 128000, - 'batch_size': 500, + 'batch_size': 32, 'temperature': 1.0, - 'openai_extra_kwargs': { + 'extra_body': { + 'top_k': 20, + 'repetition_penalty': 1.0, 'top_p': 0.95, - 'presence_penalty': 1.5, + 'chat_template_kwargs': {'enable_thinking': True}, }, + }, + 'longtext-256k': { + 'query_per_second': 4, + 'max_out_len': 280000, + 'max_seq_len': 400000, + 'batch_size': 32, + 'temperature': 1.0, 'extra_body': 
{ 'top_k': 20, - 'min_p': 0.0, - } + 'repetition_penalty': 1.0, + 'top_p': 0.95, + 'chat_template_kwargs': {'enable_thinking': True}, + }, + }, + 'longtext-512k': { + 'query_per_second': 4, + 'max_out_len': 700000, + 'max_seq_len': 700000, + 'batch_size': 32, + 'temperature': 1.0, + 'extra_body': { + 'top_k': 20, + 'repetition_penalty': 1.0, + 'top_p': 0.95, + 'chat_template_kwargs': {'enable_thinking': True}, + }, }, 'qwen3.5-2batch': { 'query_per_second': 4, @@ -158,14 +182,12 @@ 'max_seq_len': 128000, 'batch_size': 2, 'temperature': 1.0, - 'openai_extra_kwargs': { - 'top_p': 0.95, - 'presence_penalty': 1.5, - }, 'extra_body': { 'top_k': 20, - 'min_p': 0.0, - } + 'repetition_penalty': 1.0, + 'top_p': 0.95, + 'chat_template_kwargs': {'enable_thinking': True}, + }, }, } @@ -181,11 +203,18 @@ BACKEND_LIST = ['turbomind', 'pytorch'] +QWEN35_MTP_SERVER_EXTRA = { + 'reasoning-parser': 'qwen-qwq', + 'speculative-algorithm': 'qwen3_5_mtp', + 'speculative-num-draft-tokens': 4, + 'max-batch-size': 256, +} + RESTFUL_MODEL_LIST = [ 'Qwen/Qwen3-0.6B', 'Qwen/Qwen3-VL-2B-Instruct', 'Qwen/Qwen3-30B-A3B', 'internlm/Intern-S1', 'internlm/internlm2_5-20b', 'Qwen/Qwen3-32B', 'OpenGVLab/InternVL3_5-30B-A3B', 'OpenGVLab/InternVL3-38B', 'Qwen/Qwen3-VL-8B-Instruct', 'internlm/internlm3-8b-instruct', 'meta-llama/Llama-3.2-3B-Instruct', - 'Qwen/Qwen3-VL-30B-A3B-Instruct' + 'Qwen/Qwen3-VL-30B-A3B-Instruct', 'Qwen/Qwen3.5-35B-A3B', 'Qwen/Qwen3.5-35B-A3B-FP8', 'Qwen/Qwen3.5-122B-A10B' ] RESTFUL_BASE_MODEL_LIST = [ diff --git a/autotest/utils/evaluate_utils.py b/autotest/utils/evaluate_utils.py index 8535e805bf..07025b9b56 100644 --- a/autotest/utils/evaluate_utils.py +++ b/autotest/utils/evaluate_utils.py @@ -143,7 +143,16 @@ def mllm_summary(case_name, write_to_summary(case_name, result, msg, metrics, result_dir) -def eval_test(model_path, eval_path, case_name, port=DEFAULT_PORT, test_type='infer', extra_config={}, **kwargs): +def eval_test(model_path, + eval_path, + case_name, + port=DEFAULT_PORT, + test_type='infer', + extra_config=None, + eval_config_name='default', + **kwargs): + if extra_config is None: + extra_config = {} work_dir = None try: @@ -154,7 +163,13 @@ def eval_test(model_path, eval_path, case_name, port=DEFAULT_PORT, test_type='in current_dir = os.path.dirname(os.path.abspath(__file__)) parent_dir = os.path.dirname(current_dir) - config_file = os.path.join(parent_dir, 'evaluate/eval_config_chat.py') + + if eval_config_name == 'longtext-512k': + config_file = os.path.join(parent_dir, 'evaluate/eval_config_chat_512_longtext.py') + elif eval_config_name == 'longtext-256k': + config_file = os.path.join(parent_dir, 'evaluate/eval_config_chat_longtext.py') + else: + config_file = os.path.join(parent_dir, 'evaluate/eval_config_chat.py') print(f'Starting OpenCompass evaluation for model: {model_path}') print(f'Model path: {model_path}') From 686cdc5205792478571cdde839156476c0744139 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Mon, 13 Apr 2026 16:53:58 +0800 Subject: [PATCH 09/10] fix lint --- .../evaluate/eval_config_chat_longtext.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/autotest/evaluate/eval_config_chat_longtext.py b/autotest/evaluate/eval_config_chat_longtext.py index f9ba80e263..bfd3176406 100644 --- a/autotest/evaluate/eval_config_chat_longtext.py +++ b/autotest/evaluate/eval_config_chat_longtext.py @@ -107,9 +107,9 @@ ####################################################################### datasets = sum((v for k, v in 
locals().items() if k.endswith('_datasets')), []) -needlebench_8k_summary_groups = needlebench_8k_summarizer["summary_groups"] -needlebench_32k_summary_groups = needlebench_32k_summarizer["summary_groups"] -needlebench_128k_summary_groups = needlebench_128k_summarizer["summary_groups"] +needlebench_8k_summary_groups = needlebench_8k_summarizer['summary_groups'] +needlebench_32k_summary_groups = needlebench_32k_summarizer['summary_groups'] +needlebench_128k_summary_groups = needlebench_128k_summarizer['summary_groups'] # LLM judge config: using LLM to evaluate predictions judge_cfg = dict( @@ -145,14 +145,14 @@ summarizer = dict( dataset_abbrs=[ - ["ruler_8k", "naive_average"], - ["ruler_32k", "naive_average"], - ["ruler_64k", "naive_average"], - ["ruler_128k", "naive_average"], - ["ruler_256k", "naive_average"], - ["NeedleBench-Overall-Score-8K", "weighted_average"], - ["NeedleBench-Overall-Score-32K", "weighted_average"], - ["NeedleBench-Overall-Score-128K", "weighted_average"], + ['ruler_8k', 'naive_average'], + ['ruler_32k', 'naive_average'], + ['ruler_64k', 'naive_average'], + ['ruler_128k', 'naive_average'], + ['ruler_256k', 'naive_average'], + ['NeedleBench-Overall-Score-8K', 'weighted_average'], + ['NeedleBench-Overall-Score-32K', 'weighted_average'], + ['NeedleBench-Overall-Score-128K', 'weighted_average'], ['babilong_0k', 'naive_average'], ['babilong_4k', 'naive_average'], ['babilong_16k', 'naive_average'], From 45bcb87df25af5948ef147b8e4f750c768072535 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Mon, 13 Apr 2026 18:50:42 +0800 Subject: [PATCH 10/10] update evaluate test --- autotest/config_h.yml | 102 ++++++------ autotest/evaluate/test_api_evaluate.py | 212 ++++++++++--------------- autotest/utils/config_utils.py | 15 +- autotest/utils/constant.py | 7 - 4 files changed, 146 insertions(+), 190 deletions(-) diff --git a/autotest/config_h.yml b/autotest/config_h.yml index 675f1f6f4b..bbffee84fc 100644 --- a/autotest/config_h.yml +++ b/autotest/config_h.yml @@ -12,31 +12,31 @@ device: cuda config: tp: - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8: 4 Qwen/Qwen3-30B-A3B: 2 - openai/gpt-oss-120b: 2 - openai/gpt-oss-20b: 2 - unsloth/gpt-oss-20b-BF16: 2 - deepseek-ai/DeepSeek-V3.1: 8 - Qwen/Qwen3-30B-A3B-Base: 2 Qwen/Qwen3-235B-A22B-Thinking-2507: 8 - OpenGVLab/InternVL3_5-38B: 2 + Qwen/Qwen3-235B-A22B-Thinking-2507-FP8: 4 Qwen/Qwen3-VL-30B-A3B-Instruct: 2 - zai-org/GLM-5-FP8: 8 + Qwen/Qwen2.5-VL-32B-Instruct: 2 Qwen/Qwen3.5-27B: 2 Qwen/Qwen3.5-35B-A3B: 2 Qwen/Qwen3.5-122B-A10B: 4 + Qwen/Qwen3-30B-A3B-Base: 2 meta-llama/Llama-4-Scout-17B-16E-Instruct: 4 meta-llama/Meta-Llama-3.1-70B-Instruct: 4 + internlm/Intern-S1: 4 + internlm/Intern-S1-Pro-FP8: 16 OpenGVLab/InternVL3-38B: 2 - Qwen/Qwen2.5-VL-32B-Instruct: 2 + OpenGVLab/InternVL3_5-30B-A3B: 2 + OpenGVLab/InternVL3_5-38B: 2 + deepseek-ai/DeepSeek-V3.1: 8 deepseek-ai/DeepSeek-V2-Lite-Chat: 2 mistralai/Mixtral-8x7B-Instruct-v0.1: 2 - OpenGVLab/InternVL3_5-30B-A3B: 2 zai-org/GLM-4.7-Flash: 2 + zai-org/GLM-5-FP8: 8 google/gemma-3-27b-it: 2 - internlm/Intern-S1: 4 - internlm/Intern-S1-Pro-FP8: 16 + openai/gpt-oss-120b: 2 + openai/gpt-oss-20b: 2 + unsloth/gpt-oss-20b-BF16: 2 dp_ep: Qwen/Qwen3-235B-A22B-Thinking-2507: @@ -61,9 +61,12 @@ turbomind_chat_model: - meta-llama/Meta-Llama-3-8B-Instruct - internlm/internlm3-8b-instruct - internlm/internlm3-8b-instruct-awq + - internlm/Intern-S1 + - internlm/Intern-S1-mini - OpenGVLab/InternVL3-8B - OpenGVLab/InternVL3-38B - OpenGVLab/InternVL3_5-30B-A3B + - OpenGVLab/InternVL3_5-38B - 
Qwen/Qwen3-0.6B - Qwen/Qwen3-4B - Qwen/Qwen3-8B @@ -76,13 +79,10 @@ turbomind_chat_model: - Qwen/Qwen2.5-VL-32B-Instruct - Qwen/Qwen1.5-MoE-A2.7B-Chat - mistralai/Mixtral-8x7B-Instruct-v0.1 - - OpenGVLab/InternVL3_5-38B - deepseek-ai/DeepSeek-V2-Lite-Chat - THUDM/glm-4-9b-chat - openai/gpt-oss-120b - openai/gpt-oss-20b - - internlm/Intern-S1 - - internlm/Intern-S1-mini cp_tp: - Qwen/Qwen3-235B-A22B-Thinking-2507 @@ -97,9 +97,13 @@ pytorch_chat_model: - meta-llama/Meta-Llama-3.1-70B-Instruct - meta-llama/Meta-Llama-3-8B-Instruct - internlm/internlm3-8b-instruct + - internlm/Intern-S1 + - internlm/Intern-S1-mini + - internlm/Intern-S1-Pro-FP8 - OpenGVLab/InternVL3-8B - OpenGVLab/InternVL3-38B - OpenGVLab/InternVL3_5-30B-A3B + - OpenGVLab/InternVL3_5-38B - Qwen/Qwen3-0.6B - Qwen/Qwen3-4B - Qwen/Qwen3-8B @@ -110,25 +114,21 @@ pytorch_chat_model: - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8 - Qwen/Qwen3-VL-8B-Instruct - Qwen/Qwen3-VL-30B-A3B-Instruct + - Qwen/Qwen3.5-27B + - Qwen/Qwen3.5-35B-A3B + - Qwen/Qwen3.5-35B-A3B-FP8 + - Qwen/Qwen3.5-122B-A10B - THUDM/cogvlm-chat-hf - THUDM/cogvlm2-llama3-chinese-chat-19B - THUDM/glm-4v-9b - THUDM/glm-4-9b-chat - - zai-org/GLM-4.7-Flash - microsoft/Phi-3.5-vision-instruct - microsoft/Phi-3-vision-128k-instruct + - zai-org/GLM-4.7-Flash - zai-org/GLM-5-FP8 - - Qwen/Qwen3.5-27B - - Qwen/Qwen3.5-35B-A3B - - Qwen/Qwen3.5-35B-A3B-FP8 - - Qwen/Qwen3.5-122B-A10B - deepseek-ai/DeepSeek-V3.1 - - unsloth/gpt-oss-20b-BF16 - google/gemma-3-27b-it - - OpenGVLab/InternVL3_5-38B - - internlm/Intern-S1 - - internlm/Intern-S1-mini - - internlm/Intern-S1-Pro-FP8 + - unsloth/gpt-oss-20b-BF16 dp_ep: - Qwen/Qwen3-235B-A22B-Thinking-2507 @@ -139,9 +139,9 @@ turbomind_vl_model: - OpenGVLab/InternVL3-8B - OpenGVLab/InternVL3-38B - OpenGVLab/InternVL3_5-30B-A3B + - OpenGVLab/InternVL3_5-38B - Qwen/Qwen2.5-VL-7B-Instruct - Qwen/Qwen2.5-VL-32B-Instruct - - OpenGVLab/InternVL3_5-38B - internlm/Intern-S1 - internlm/Intern-S1-mini @@ -151,15 +151,15 @@ pytorch_vl_model: - OpenGVLab/InternVL3_5-30B-A3B - Qwen/Qwen3-VL-8B-Instruct - Qwen/Qwen3-VL-30B-A3B-Instruct + - Qwen/Qwen3.5-27B + - Qwen/Qwen3.5-35B-A3B + - Qwen/Qwen3.5-35B-A3B-FP8 + - Qwen/Qwen3.5-122B-A10B - THUDM/cogvlm-chat-hf - THUDM/cogvlm2-llama3-chinese-chat-19B - THUDM/glm-4v-9b - microsoft/Phi-3-vision-128k-instruct - microsoft/Phi-3.5-vision-instruct - - Qwen/Qwen3.5-27B - - Qwen/Qwen3.5-35B-A3B - - Qwen/Qwen3.5-35B-A3B-FP8 - - Qwen/Qwen3.5-122B-A10B - internlm/Intern-S1 - internlm/Intern-S1-mini @@ -178,6 +178,9 @@ turbomind_quantization: no_awq: - meta-llama/Meta-Llama-3.1-70B-Instruct - internlm/internlm3-8b-instruct + - internlm/Intern-S1 + - internlm/Intern-S1-mini + - OpenGVLab/InternVL3_5-30B-A3B - Qwen/Qwen3-30B-A3B - Qwen/Qwen3-235B-A22B-Thinking-2507 - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8 @@ -188,15 +191,12 @@ turbomind_quantization: - Qwen/Qwen1.5-MoE-A2.7B-Chat - Qwen/Qwen2.5-VL-7B-Instruct - Qwen/Qwen2.5-VL-32B-Instruct - - OpenGVLab/InternVL3_5-30B-A3B - deepseek-ai/DeepSeek-V2-Lite-Chat - - openai/gpt-oss-120b - - openai/gpt-oss-20b - - microsoft/Phi-3-mini-4k-instruct - THUDM/glm-4v-9b - THUDM/glm-4-9b-chat - - internlm/Intern-S1 - - internlm/Intern-S1-mini + - microsoft/Phi-3-mini-4k-instruct + - openai/gpt-oss-120b + - openai/gpt-oss-20b gptq: - empty @@ -219,10 +219,10 @@ turbomind_quantization: - Qwen/Qwen3-8B-Base - Qwen/Qwen3-30B-A3B-Base - deepseek-ai/DeepSeek-V2-Lite-Chat - - openai/gpt-oss-120b - - openai/gpt-oss-20b - internlm/Intern-S1 - internlm/Intern-S1-mini + - openai/gpt-oss-120b + - 
openai/gpt-oss-20b no_kvint8: - deepseek-ai/DeepSeek-V2-Chat - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8 @@ -246,8 +246,6 @@ pytorch_quantization: - meta-llama/Llama-3.2-1B-Instruct - OpenGVLab/InternVL3-2B - OpenGVLab/InternVL3-8B - - Qwen/Qwen3-8B-Base - - Qwen/Qwen3-30B-A3B-Base - Qwen/Qwen3-0.6B - Qwen/Qwen3-4B - Qwen/Qwen3-8B @@ -259,13 +257,15 @@ pytorch_quantization: - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8 - Qwen/Qwen3-VL-8B-Instruct - Qwen/Qwen3-VL-30B-A3B-Instruct + - Qwen/Qwen3-8B-Base + - Qwen/Qwen3-30B-A3B-Base + - Qwen/Qwen3.5-27B + - Qwen/Qwen3.5-35B-A3B + - Qwen/Qwen3.5-122B-A10B - microsoft/Phi-3-vision-128k-instruct - microsoft/Phi-3.5-vision-instruct - zai-org/GLM-4.7-Flash - zai-org/GLM-5-FP8 - - Qwen/Qwen3.5-27B - - Qwen/Qwen3.5-35B-A3B - - Qwen/Qwen3.5-122B-A10B - deepseek-ai/DeepSeek-V3.1 - internlm/Intern-S1 - internlm/Intern-S1-mini @@ -284,11 +284,11 @@ pytorch_quantization: longtext_benchmark_model: - Qwen/Qwen3-30B-A3B - Qwen/Qwen3-235B-A22B-Thinking-2507 - - zai-org/GLM-5-FP8 - Qwen/Qwen3.5-27B - Qwen/Qwen3.5-35B-A3B - Qwen/Qwen3.5-35B-A3B-FP8 - Qwen/Qwen3.5-122B-A10B + - zai-org/GLM-5-FP8 benchmark_model: - meta-llama/Meta-Llama-3.1-8B-Instruct @@ -296,14 +296,14 @@ benchmark_model: - Qwen/Qwen3-30B-A3B - Qwen/Qwen3-235B-A22B-Thinking-2507 - Qwen/Qwen2.5-72B-Instruct - - openai/gpt-oss-120b - - openai/gpt-oss-20b - - unsloth/gpt-oss-20b-BF16 - - zai-org/GLM-5-FP8 - Qwen/Qwen3.5-27B - Qwen/Qwen3.5-35B-A3B - Qwen/Qwen3.5-35B-A3B-FP8 - Qwen/Qwen3.5-122B-A10B + - openai/gpt-oss-120b + - openai/gpt-oss-20b + - unsloth/gpt-oss-20b-BF16 + - zai-org/GLM-5-FP8 - google/gemma-3-27b-it - internlm/Intern-S1 - internlm/Intern-S1-mini @@ -314,13 +314,13 @@ evaluate_model: - Qwen/Qwen3-30B-A3B-FP8 - Qwen/Qwen3-235B-A22B-Thinking-2507 - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8 - - openai/gpt-oss-120b - - deepseek-ai/DeepSeek-V3.1 - - zai-org/GLM-5-FP8 - Qwen/Qwen3.5-27B - Qwen/Qwen3.5-35B-A3B - Qwen/Qwen3.5-35B-A3B-FP8 - Qwen/Qwen3.5-122B-A10B + - openai/gpt-oss-120b + - deepseek-ai/DeepSeek-V3.1 + - zai-org/GLM-5-FP8 - internlm/Intern-S1-Pro-FP8 longtext_evaluate_model: diff --git a/autotest/evaluate/test_api_evaluate.py b/autotest/evaluate/test_api_evaluate.py index 7877b29c04..3f72825ee0 100644 --- a/autotest/evaluate/test_api_evaluate.py +++ b/autotest/evaluate/test_api_evaluate.py @@ -1,4 +1,3 @@ -import copy import os import time @@ -116,30 +115,22 @@ def _run_proxy_distributed_test(config, def run_eval_test(config, run_config, worker_id, test_type='infer', eval_config_name='default', eval_subpath=None): """Run test with specified evaluation configuration.""" if eval_config_name == 'default': - longtext_key = run_config.get('_longtext_eval_config_name') - if longtext_key: - eval_config_name = longtext_key - else: - if 'gpt' in run_config.get('model', '').lower(): - eval_config_name = 'gpt' - elif 'sdar' in run_config.get('model', '').lower(): - eval_config_name = 'sdar' - elif 'intern-s1-pro' in run_config.get('model', '').lower(): - eval_config_name = 'intern-s1-pro' - elif 'qwen3.5' in run_config.get('model', '').lower(): - eval_config_name = 'qwen3.5' - if str(config.get('env_tag')) == 'a100': - eval_config_name = f'{eval_config_name}-32k' - elif str(config.get('env_tag')) == 'ascend': - eval_config_name = f'{eval_config_name}-2batch' + if 'gpt' in run_config.get('model', '').lower(): + eval_config_name = 'gpt' + elif 'sdar' in run_config.get('model', '').lower(): + eval_config_name = 'sdar' + elif 'intern-s1-pro' in run_config.get('model', '').lower(): + eval_config_name 
= 'intern-s1-pro' + elif 'qwen3.5' in run_config.get('model', '').lower(): + eval_config_name = 'qwen3.5' + if str(config.get('env_tag')) == 'a100': + eval_config_name = f'{eval_config_name}-32k' + elif str(config.get('env_tag')) == 'ascend': + eval_config_name = f'{eval_config_name}-2batch' preset_config = constant.EVAL_CONFIGS.get(eval_config_name, {}) eval_path = config.get('eval_path') if eval_subpath: - rel = eval_subpath - nested = run_config.get('_eval_path_subdir') - if nested: - rel = os.path.join(rel, nested) - eval_path = os.path.join(eval_path, rel) + eval_path = os.path.join(eval_path, eval_subpath) os.makedirs(eval_path, exist_ok=True) total_gpus = int(os.environ.get('TOTAL_GPU_COUNT', '8')) @@ -213,80 +204,11 @@ def run_openai_service_start(i): stop_restful_api(proxy_pid, proxy_process) -def get_models(backend, parallel_config, session_len='auto'): - if session_len == 'auto': - configs = get_func_config_list(backend, parallel_config, func_type='evaluate', extra={}) - result = [] - for config in configs: - model = config.get('model', '') - if 'Qwen3.5' not in model: - if 'extra_params' not in config: - config['extra_params'] = {} - config['extra_params']['session_len'] = 65536 - result.append(config) - return result - else: - extra = {'session_len': session_len} if session_len is not None else {} - return get_func_config_list(backend, parallel_config, func_type='evaluate', extra=extra) - - -def _resolve_longtext_eval_config_name(run_config: dict) -> str | None: - """Map longtext_evaluate config to EVAL_CONFIGS key; add branches when new - longtext families ship.""" - ep = run_config.get('extra_params') or {} - raw = ep.get('session_len', ep.get('session-len')) - if raw is None: - return None - try: - sl = int(raw) - except (TypeError, ValueError): - return None - model_lower = (run_config.get('model') or '').lower() - if 'qwen3.5' in model_lower: - if sl >= 600000: - return 'longtext-512k' - if sl >= 300000: - return 'longtext-256k' - return None - - -def get_longtext_models(backend, parallel_config, session_len='auto'): - if session_len == 'auto': - session_len = 65536 - extra = {'session_len': session_len} if session_len is not None else {} - configs = get_func_config_list(backend, parallel_config, func_type='longtext_evaluate', extra=extra) - for cfg in configs: - preset_key = _resolve_longtext_eval_config_name(cfg) - if preset_key: - cfg['_longtext_eval_config_name'] = preset_key - return configs - - -def get_mtp_models(backend, parallel_config): - base_configs = get_func_config_list(backend, parallel_config, func_type='mtp_evaluate', extra={}) - for cfg in base_configs: - if 'qwen3.5' in cfg.get('model', '').lower(): - cfg['extra_params'].update(constant.QWEN35_MTP_SERVER_EXTRA) - - result_configs = [] - for config in base_configs: - result_configs.append(config) - - if config.get('model') == 'Qwen/Qwen3.5-35B-A3B' and parallel_config.get('tp') == 2: - fp8_config = copy.deepcopy(config) - fp8_config['extra_params']['max-prefill-token-num'] = 1024 - fp8_config['extra_params']['model-format'] = 'fp8' - fp8_config['_eval_path_subdir'] = 'serve_fp8' - result_configs.append(fp8_config) - - return result_configs - - @pytest.mark.infer @pytest.mark.turbomind @pytest.mark.gpu_num_1 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('turbomind', {'tp': 1})) +@pytest.mark.parametrize('run_config', get_func_config_list('turbomind', {'tp': 1}, func_type='evaluate')) def test_turbomind_infer_tp1(config, run_config, worker_id): run_eval_test(config, 
run_config, worker_id, 'infer') @@ -295,7 +217,7 @@ def test_turbomind_infer_tp1(config, run_config, worker_id): @pytest.mark.turbomind @pytest.mark.gpu_num_2 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('turbomind', {'tp': 2})) +@pytest.mark.parametrize('run_config', get_func_config_list('turbomind', {'tp': 2}, func_type='evaluate')) def test_turbomind_infer_tp2(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'infer') @@ -304,7 +226,7 @@ def test_turbomind_infer_tp2(config, run_config, worker_id): @pytest.mark.turbomind @pytest.mark.gpu_num_4 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('turbomind', {'tp': 4})) +@pytest.mark.parametrize('run_config', get_func_config_list('turbomind', {'tp': 4}, func_type='evaluate')) def test_turbomind_infer_tp4(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'infer') @@ -313,7 +235,7 @@ def test_turbomind_infer_tp4(config, run_config, worker_id): @pytest.mark.turbomind @pytest.mark.gpu_num_8 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('turbomind', {'tp': 8})) +@pytest.mark.parametrize('run_config', get_func_config_list('turbomind', {'tp': 8}, func_type='evaluate')) def test_turbomind_infer_tp8(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'infer') @@ -322,7 +244,7 @@ def test_turbomind_infer_tp8(config, run_config, worker_id): @pytest.mark.turbomind @pytest.mark.gpu_num_distributed_cp2tp8 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('turbomind', {'cp': 2, 'tp': 8})) +@pytest.mark.parametrize('run_config', get_func_config_list('turbomind', {'cp': 2, 'tp': 8}, func_type='evaluate')) def test_turbomind_infer_cp2tp8(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'infer') @@ -332,7 +254,7 @@ def test_turbomind_infer_cp2tp8(config, run_config, worker_id): @pytest.mark.gpu_num_1 @pytest.mark.test_ascend @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('pytorch', {'tp': 1})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 1}, func_type='evaluate')) def test_pytorch_restful_tp1(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'infer') @@ -342,7 +264,7 @@ def test_pytorch_restful_tp1(config, run_config, worker_id): @pytest.mark.gpu_num_2 @pytest.mark.test_ascend @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('pytorch', {'tp': 2})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 2}, func_type='evaluate')) def test_pytorch_restful_tp2(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'infer') @@ -351,18 +273,34 @@ def test_pytorch_restful_tp2(config, run_config, worker_id): @pytest.mark.pytorch @pytest.mark.gpu_num_2 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_longtext_models('pytorch', {'tp': 2}, session_len=400000)) +@pytest.mark.parametrize( + 'run_config', + get_func_config_list( + 'pytorch', + {'tp': 2}, + func_type='longtext_evaluate', + extra={'session_len': 400000}, + ), +) def test_pytorch_restful_tp2_longtext(config, run_config, worker_id): - run_eval_test(config, run_config, worker_id, 'infer') + run_eval_test(config, run_config, worker_id, 'infer', eval_config_name='longtext-256k') @pytest.mark.infer @pytest.mark.pytorch @pytest.mark.gpu_num_2 @pytest.mark.flaky(reruns=0) 
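+# session_len=700000 matches the 'longtext-512k' preset's max_seq_len in constant.EVAL_CONFIGS.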
-@pytest.mark.parametrize('run_config', get_longtext_models('pytorch', {'tp': 2}, session_len=700000)) +@pytest.mark.parametrize( + 'run_config', + get_func_config_list( + 'pytorch', + {'tp': 2}, + func_type='longtext_evaluate', + extra={'session_len': 700000}, + ), +) def test_pytorch_restful_tp2_longtext_512k(config, run_config, worker_id): - run_eval_test(config, run_config, worker_id, 'infer') + run_eval_test(config, run_config, worker_id, 'infer', eval_config_name='longtext-512k') @pytest.mark.infer @@ -370,7 +308,7 @@ def test_pytorch_restful_tp2_longtext_512k(config, run_config, worker_id): @pytest.mark.gpu_num_2 @pytest.mark.mtp @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_mtp_models('pytorch', {'tp': 2})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 2}, func_type='mtp_evaluate')) def test_pytorch_restful_tp2_mtp(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'infer', eval_subpath='mtp') @@ -380,7 +318,7 @@ def test_pytorch_restful_tp2_mtp(config, run_config, worker_id): @pytest.mark.gpu_num_1 @pytest.mark.mtp @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_mtp_models('pytorch', {'tp': 1})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 1}, func_type='mtp_evaluate')) def test_pytorch_restful_tp1_mtp(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'infer', eval_subpath='mtp') @@ -390,7 +328,7 @@ def test_pytorch_restful_tp1_mtp(config, run_config, worker_id): @pytest.mark.gpu_num_2 @pytest.mark.mtp @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_mtp_models('pytorch', {'tp': 2})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 2}, func_type='mtp_evaluate')) def test_pytorch_eval_tp2_mtp(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval', eval_subpath='mtp') @@ -400,7 +338,7 @@ def test_pytorch_eval_tp2_mtp(config, run_config, worker_id): @pytest.mark.gpu_num_1 @pytest.mark.mtp @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_mtp_models('pytorch', {'tp': 1})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 1}, func_type='mtp_evaluate')) def test_pytorch_eval_tp1_mtp(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval', eval_subpath='mtp') @@ -410,7 +348,7 @@ def test_pytorch_eval_tp1_mtp(config, run_config, worker_id): @pytest.mark.gpu_num_4 @pytest.mark.test_ascend @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('pytorch', {'tp': 4})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 4}, func_type='evaluate')) def test_pytorch_restful_tp4(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'infer') @@ -420,7 +358,7 @@ def test_pytorch_restful_tp4(config, run_config, worker_id): @pytest.mark.gpu_num_8 @pytest.mark.test_ascend @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('pytorch', {'tp': 8})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 8}, func_type='evaluate')) def test_pytorch_restful_tp8(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'infer') @@ -430,7 +368,7 @@ def test_pytorch_restful_tp8(config, run_config, worker_id): @pytest.mark.gpu_num_16 @pytest.mark.test_ascend @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', 
get_models('pytorch', {'tp': 16})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 16}, func_type='evaluate')) def test_pytorch_restful_tp16(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'infer') @@ -439,7 +377,7 @@ def test_pytorch_restful_tp16(config, run_config, worker_id): @pytest.mark.pytorch @pytest.mark.gpu_num_distributed_tp16 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('pytorch', {'tp': 16})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 16}, func_type='evaluate')) def test_pytorch_restful_distributed_tp16(shared_ray_manager, config, run_config, worker_id): _run_ray_distributed_test(config=config, run_config=run_config, @@ -452,7 +390,7 @@ def test_pytorch_restful_distributed_tp16(shared_ray_manager, config, run_config @pytest.mark.pytorch @pytest.mark.gpu_num_distributed_dpep8 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('pytorch', {'dp': 8, 'ep': 8})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'dp': 8, 'ep': 8}, func_type='evaluate')) def test_pytorch_restful_distributed_dpep8(shared_proxy_manager, config, run_config, worker_id): _run_proxy_distributed_test(config=config, run_config=run_config, @@ -465,7 +403,7 @@ def test_pytorch_restful_distributed_dpep8(shared_proxy_manager, config, run_con @pytest.mark.pytorch @pytest.mark.gpu_num_distributed_dpep16 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('pytorch', {'dp': 16, 'ep': 16})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'dp': 16, 'ep': 16}, func_type='evaluate')) def test_pytorch_restful_distributed_dpep16(shared_proxy_manager, config, run_config, worker_id): _run_proxy_distributed_test(config=config, run_config=run_config, @@ -478,7 +416,7 @@ def test_pytorch_restful_distributed_dpep16(shared_proxy_manager, config, run_co @pytest.mark.turbomind @pytest.mark.gpu_num_1 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('turbomind', {'tp': 1})) +@pytest.mark.parametrize('run_config', get_func_config_list('turbomind', {'tp': 1}, func_type='evaluate')) def test_turbomind_eval_tp1(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval') @@ -487,7 +425,7 @@ def test_turbomind_eval_tp1(config, run_config, worker_id): @pytest.mark.turbomind @pytest.mark.gpu_num_2 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('turbomind', {'tp': 2})) +@pytest.mark.parametrize('run_config', get_func_config_list('turbomind', {'tp': 2}, func_type='evaluate')) def test_turbomind_eval_tp2(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval') @@ -496,7 +434,7 @@ def test_turbomind_eval_tp2(config, run_config, worker_id): @pytest.mark.turbomind @pytest.mark.gpu_num_4 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('turbomind', {'tp': 4})) +@pytest.mark.parametrize('run_config', get_func_config_list('turbomind', {'tp': 4}, func_type='evaluate')) def test_turbomind_eval_tp4(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval') @@ -505,7 +443,7 @@ def test_turbomind_eval_tp4(config, run_config, worker_id): @pytest.mark.turbomind @pytest.mark.gpu_num_8 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('turbomind', {'tp': 8})) +@pytest.mark.parametrize('run_config', 
get_func_config_list('turbomind', {'tp': 8}, func_type='evaluate')) def test_turbomind_eval_tp8(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval') @@ -515,7 +453,7 @@ def test_turbomind_eval_tp8(config, run_config, worker_id): @pytest.mark.gpu_num_1 @pytest.mark.test_ascend @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('pytorch', {'tp': 1})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 1}, func_type='evaluate')) def test_pytorch_eval_tp1(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval') @@ -525,7 +463,7 @@ def test_pytorch_eval_tp1(config, run_config, worker_id): @pytest.mark.gpu_num_2 @pytest.mark.test_ascend @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('pytorch', {'tp': 2})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 2}, func_type='evaluate')) def test_pytorch_eval_tp2(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval') @@ -535,7 +473,7 @@ def test_pytorch_eval_tp2(config, run_config, worker_id): @pytest.mark.gpu_num_4 @pytest.mark.test_ascend @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('pytorch', {'tp': 4})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 4}, func_type='evaluate')) def test_pytorch_eval_tp4(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval') @@ -545,7 +483,7 @@ def test_pytorch_eval_tp4(config, run_config, worker_id): @pytest.mark.gpu_num_8 @pytest.mark.test_ascend @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('pytorch', {'tp': 8})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 8}, func_type='evaluate')) def test_pytorch_eval_tp8(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval') @@ -555,7 +493,7 @@ def test_pytorch_eval_tp8(config, run_config, worker_id): @pytest.mark.gpu_num_16 @pytest.mark.test_ascend @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('pytorch', {'tp': 16})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 16}, func_type='evaluate')) def test_pytorch_eval_tp16(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval') @@ -564,7 +502,7 @@ def test_pytorch_eval_tp16(config, run_config, worker_id): @pytest.mark.pytorch @pytest.mark.gpu_num_distributed_tp16 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('pytorch', {'tp': 16})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 16}, func_type='evaluate')) def test_pytorch_eval_distributed_tp16(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval') @@ -573,7 +511,7 @@ def test_pytorch_eval_distributed_tp16(config, run_config, worker_id): @pytest.mark.pytorch @pytest.mark.gpu_num_distributed_dpep8 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('pytorch', {'dp': 8, 'ep': 8})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'dp': 8, 'ep': 8}, func_type='evaluate')) def test_pytorch_eval_distributed_dpep8(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval') @@ -582,7 +520,7 @@ def test_pytorch_eval_distributed_dpep8(config, run_config, worker_id): @pytest.mark.pytorch @pytest.mark.gpu_num_distributed_dpep16 
@pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('pytorch', {'dp': 16, 'ep': 16})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'dp': 16, 'ep': 16}, func_type='evaluate')) def test_pytorch_eval_distributed_dpep16(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval') @@ -591,24 +529,40 @@ def test_pytorch_eval_distributed_dpep16(config, run_config, worker_id): @pytest.mark.pytorch @pytest.mark.gpu_num_2 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_longtext_models('pytorch', {'tp': 2}, session_len=400000)) +@pytest.mark.parametrize( + 'run_config', + get_func_config_list( + 'pytorch', + {'tp': 2}, + func_type='longtext_evaluate', + extra={'session_len': 400000}, + ), +) def test_pytorch_eval_tp2_longtext(config, run_config, worker_id): - run_eval_test(config, run_config, worker_id, 'eval') + run_eval_test(config, run_config, worker_id, 'eval', eval_config_name='longtext-256k') @pytest.mark.eval @pytest.mark.pytorch @pytest.mark.gpu_num_2 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_longtext_models('pytorch', {'tp': 2}, session_len=700000)) +@pytest.mark.parametrize( + 'run_config', + get_func_config_list( + 'pytorch', + {'tp': 2}, + func_type='longtext_evaluate', + extra={'session_len': 700000}, + ), +) def test_pytorch_eval_tp2_longtext_512k(config, run_config, worker_id): - run_eval_test(config, run_config, worker_id, 'eval') + run_eval_test(config, run_config, worker_id, 'eval', eval_config_name='longtext-512k') @pytest.mark.eval @pytest.mark.turbomind @pytest.mark.gpu_num_distributed_cp2tp8 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('turbomind', {'cp': 2, 'tp': 8})) +@pytest.mark.parametrize('run_config', get_func_config_list('turbomind', {'cp': 2, 'tp': 8}, func_type='evaluate')) def test_turbomind_eval_cp2tp8(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval') diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index 82883d189d..6e02c1efbb 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -49,7 +49,7 @@ def get_func_config_list(backend: str, parallel_config: Parallel config for tensor parallel model_type: Model type, default: chat_model func_type: Test func type filter, default: func - extra: extra config to update in each run config dict + extra: extra config merged into each run config's extra_params. 
Returns: list[dict]: All valid run config dicts """ @@ -110,8 +110,9 @@ def get_func_config_list(backend: str, run_config['extra_params']['max-batch-size'] = 128 run_config['extra_params']['model-format'] = 'fp8' - if 'Qwen3.5' in run_config['model']: - run_config['extra_params']['session-len'] = 128000 + if (func_type == 'evaluate' and 'session_len' not in extra + and 'session-len' not in extra and 'Qwen3.5' not in run_config['model']): + run_config['extra_params']['session_len'] = 65536 if config.get('env_tag', '') in ['3090', '5080']: run_config['extra_params']['cache-max-entry-count'] = 0.5 @@ -143,6 +144,14 @@ def get_func_config_list(backend: str, and func_type in ('benchmark', 'longtext_benchmark')): run_config['extra_params']['model-format'] = 'mxfp4' + if func_type == 'mtp_evaluate' and 'Qwen3.5' in run_config['model']: + run_config['extra_params'].update({ + 'reasoning-parser': 'qwen-qwq', + 'speculative-algorithm': 'qwen3_5_mtp', + 'speculative-num-draft-tokens': 4, + 'max-batch-size': 256, + }) + return run_configs diff --git a/autotest/utils/constant.py b/autotest/utils/constant.py index 1ac6d47d1d..9a366fca55 100644 --- a/autotest/utils/constant.py +++ b/autotest/utils/constant.py @@ -203,13 +203,6 @@ BACKEND_LIST = ['turbomind', 'pytorch'] -QWEN35_MTP_SERVER_EXTRA = { - 'reasoning-parser': 'qwen-qwq', - 'speculative-algorithm': 'qwen3_5_mtp', - 'speculative-num-draft-tokens': 4, - 'max-batch-size': 256, -} - RESTFUL_MODEL_LIST = [ 'Qwen/Qwen3-0.6B', 'Qwen/Qwen3-VL-2B-Instruct', 'Qwen/Qwen3-30B-A3B', 'internlm/Intern-S1', 'internlm/internlm2_5-20b', 'Qwen/Qwen3-32B', 'OpenGVLab/InternVL3_5-30B-A3B', 'OpenGVLab/InternVL3-38B',