From c17c78b973467cd91688eba28b6af49e54062980 Mon Sep 17 00:00:00 2001
From: root
Date: Wed, 18 Mar 2026 15:11:08 +0800
Subject: [PATCH 01/10] update config h and add glm4.7 mtp test

---
 autotest/config_h.yml                       | 268 ++++++++++++------
 autotest/config_h_legacy.yml                |  26 +-
 autotest/evaluate/test_api_evaluate.py      |  10 +-
 autotest/tools/common_case_config.py        |  26 ++
 .../test_pipeline_chat_pytorch_llm.py       |  10 +
 .../test_restful_chat_hf_pytorch_llm.py     |  10 +
 autotest/utils/benchmark_utils.py           |  26 +-
 autotest/utils/config_utils.py              |  10 +
 autotest/utils/constant.py                  |  32 ++-
 autotest/utils/run_client_chat.py           |  21 +-
 10 files changed, 332 insertions(+), 107 deletions(-)

diff --git a/autotest/config_h.yml b/autotest/config_h.yml
index 667033f36c..b0ed3a8f87 100644
--- a/autotest/config_h.yml
+++ b/autotest/config_h.yml
@@ -12,33 +12,34 @@ device: cuda
 config:
   tp:
-    Qwen/Qwen3-235B-A22B-FP8: 4
-    internlm/Intern-S1: 4
     Qwen/Qwen3-235B-A22B-Thinking-2507-FP8: 4
     Qwen/Qwen3-30B-A3B: 2
-    Qwen/Qwen3-32B: 2
     openai/gpt-oss-120b: 2
-    openai/gpt-oss-120b-BF16: 4
-    openai/gpt-oss-20b-BF16: 2
-    deepseek/DeepSeek-V3.1: 8
+    openai/gpt-oss-20b: 2
+    unsloth/gpt-oss-20b-BF16: 2
+    deepseek-ai/DeepSeek-V3.1: 8
     Qwen/Qwen3-30B-A3B-Base: 2
-    JetLM/SDAR-30B-A3B-Sci: 2
-    moonshotai/Kimi-K2-Instruct-0905: 16
     Qwen/Qwen3-235B-A22B-Thinking-2507: 8
     OpenGVLab/InternVL3_5-38B: 2
     Qwen/Qwen3-VL-30B-A3B-Instruct: 2
-    internlm/Intern-S1-Pro-FP8: 16
+    zai-org/GLM-5: 16
+    Qwen/Qwen3.5-27B: 2
+    Qwen/Qwen3.5-35B-A3B: 2
+    Qwen/Qwen3.5-122B-A10B: 4
+    meta-llama/Llama-4-Scout-17B-16E-Instruct: 4
+    meta-llama/Meta-Llama-3.1-70B-Instruct: 4
+    OpenGVLab/InternVL3-38B: 2
+    Qwen/Qwen2.5-VL-32B-Instruct: 2
+    deepseek-ai/DeepSeek-V2-Lite-Chat: 2
+    mistralai/Mixtral-8x7B-Instruct-v0.1: 2
+    OpenGVLab/InternVL3_5-30B-A3B: 2
+    zai-org/GLM-4.7-Flash: 2
+    google/gemma-3-27b-it: 2

   dp_ep:
-    moonshotai/Kimi-K2-Instruct-0905:
-      dp: 16
-      ep: 16
     Qwen/Qwen3-235B-A22B-Thinking-2507:
       dp: 8
       ep: 8
-    internlm/Intern-S1-Pro-FP8:
-      dp: 16
-      ep: 16

   cp_tp:
     Qwen/Qwen3-235B-A22B-Thinking-2507:
@@ -48,64 +49,109 @@ config:

 turbomind_chat_model:
   tp:
-    - Qwen/Qwen3-0.6B-FP8
-    - Qwen/Qwen3-1.7B-FP8
-    - Qwen/Qwen3-4B-FP8
-    - Qwen/Qwen3-8B-FP8
-    - Qwen/Qwen3-14B-FP8
-    - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
-    - Qwen/Qwen3-235B-A22B-Thinking-2507
+    - meta-llama/Llama-3.2-1B-Instruct
+    - meta-llama/Llama-3.2-3B-Instruct
+    - meta-llama/Meta-Llama-3.1-8B-Instruct
+    - meta-llama/Meta-Llama-3.1-70B-Instruct
+    - meta-llama/Meta-Llama-3-8B-Instruct
+    - internlm/internlm3-8b-instruct
+    - internlm/internlm3-8b-instruct-awq
+    - OpenGVLab/InternVL3-8B
+    - OpenGVLab/InternVL3-38B
+    - OpenGVLab/InternVL3_5-30B-A3B
+    - Qwen/Qwen3-0.6B
+    - Qwen/Qwen3-4B
+    - Qwen/Qwen3-8B
+    - Qwen/Qwen3-32B-FP8
     - Qwen/Qwen3-30B-A3B
     - Qwen/Qwen3-30B-A3B-FP8
-    - Qwen/Qwen3-32B
-    - Qwen/Qwen3-32B-FP8
+    - Qwen/Qwen3-235B-A22B-Thinking-2507
+    - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
+    - Qwen/Qwen2.5-VL-7B-Instruct
+    - Qwen/Qwen2.5-VL-32B-Instruct
+    - Qwen/Qwen1.5-MoE-A2.7B-Chat
+    - mistralai/Mixtral-8x7B-Instruct-v0.1
     - OpenGVLab/InternVL3_5-38B
+    - deepseek-ai/DeepSeek-V2-Lite-Chat
+    - THUDM/glm-4-9b-chat
     - openai/gpt-oss-120b
     - openai/gpt-oss-20b

   cp_tp:
     - Qwen/Qwen3-235B-A22B-Thinking-2507
+
 pytorch_chat_model:
   tp:
-    - Qwen/Qwen3-0.6B-FP8
-    - Qwen/Qwen3-1.7B-FP8
-    - Qwen/Qwen3-4B-FP8
-    - Qwen/Qwen3-8B-FP8
-    - Qwen/Qwen3-14B-FP8
-    - Qwen/Qwen3-235B-A22B-Thinking-2507
-    - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
+    - meta-llama/Llama-4-Scout-17B-16E-Instruct
+    - meta-llama/Llama-3.2-1B-Instruct
+    - meta-llama/Llama-3.2-3B-Instruct
+    - meta-llama/Meta-Llama-3.1-8B-Instruct
+    - meta-llama/Meta-Llama-3.1-70B-Instruct
+    - meta-llama/Meta-Llama-3-8B-Instruct
+    - internlm/internlm3-8b-instruct
+    - OpenGVLab/InternVL3-8B
+    - OpenGVLab/InternVL3-38B
+    - OpenGVLab/InternVL3_5-30B-A3B
+    - Qwen/Qwen3-0.6B
+    - Qwen/Qwen3-4B
+    - Qwen/Qwen3-8B
+    - Qwen/Qwen3-32B-FP8
     - Qwen/Qwen3-30B-A3B
     - Qwen/Qwen3-30B-A3B-FP8
-    - Qwen/Qwen3-32B
-    - Qwen/Qwen3-32B-FP8
+    - Qwen/Qwen3-235B-A22B-Thinking-2507
+    - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
+    - Qwen/Qwen3-VL-8B-Instruct
     - Qwen/Qwen3-VL-30B-A3B-Instruct
-    - OpenGVLab/InternVL3_5-38B
-    - unsloth/gpt-oss-120b-BF16
+    - THUDM/cogvlm-chat-hf
+    - THUDM/cogvlm2-llama3-chinese-chat-19B
+    - THUDM/glm-4v-9b
+    - THUDM/glm-4-9b-chat
+    - zai-org/GLM-4.7-Flash
+    - microsoft/Phi-3.5-vision-instruct
+    - microsoft/Phi-3-vision-128k-instruct
+    - zai-org/GLM-5
+    - Qwen/Qwen3.5-27B
+    - Qwen/Qwen3.5-35B-A3B
+    - Qwen/Qwen3.5-122B-A10B
+    - deepseek-ai/DeepSeek-V3.1
     - unsloth/gpt-oss-20b-BF16
-    - deepseek/DeepSeek-V3.1
-    - moonshotai/Kimi-K2-Instruct-0905
-    - internlm/Intern-S1-Pro-FP8
-    - JetLM/SDAR-30B-A3B-Sci
+    - google/gemma-3-27b-it
+    - OpenGVLab/InternVL3_5-38B

   dp_ep:
-    - moonshotai/Kimi-K2-Instruct-0905
     - Qwen/Qwen3-235B-A22B-Thinking-2507
-    - internlm/Intern-S1-Pro-FP8

 turbomind_vl_model:
   tp:
+    - OpenGVLab/InternVL3-8B
+    - OpenGVLab/InternVL3-38B
+    - OpenGVLab/InternVL3_5-30B-A3B
+    - Qwen/Qwen2.5-VL-7B-Instruct
+    - Qwen/Qwen2.5-VL-32B-Instruct
     - OpenGVLab/InternVL3_5-38B

 pytorch_vl_model:
   tp:
-    - OpenGVLab/InternVL3_5-38B
+    - OpenGVLab/InternVL3-8B
+    - OpenGVLab/InternVL3_5-30B-A3B
+    - Qwen/Qwen3-VL-8B-Instruct
     - Qwen/Qwen3-VL-30B-A3B-Instruct
+    - THUDM/cogvlm-chat-hf
+    - THUDM/cogvlm2-llama3-chinese-chat-19B
+    - THUDM/glm-4v-9b
+    - microsoft/Phi-3-vision-128k-instruct
+    - microsoft/Phi-3.5-vision-instruct
+    - Qwen/Qwen3.5-27B
+    - Qwen/Qwen3.5-35B-A3B
+    - Qwen/Qwen3.5-122B-A10B
+    - OpenGVLab/InternVL3_5-38B
+

 turbomind_base_model:
   tp:
-    - Qwen/Qwen3-4B-FP8
-    - openai/gpt-oss-20b
+    - Qwen/Qwen3-8B-Base
+    - Qwen/Qwen3-30B-A3B-Base

 pytorch_base_model:
   tp:
@@ -114,94 +160,134 @@ pytorch_base_model:

 turbomind_quantization:
   no_awq:
-    - Qwen/Qwen3-0.6B-FP8
-    - Qwen/Qwen3-1.7B-FP8
-    - Qwen/Qwen3-4B-FP8
-    - Qwen/Qwen3-8B-FP8
-    - Qwen/Qwen3-14B-FP8
-    - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
+    - meta-llama/Meta-Llama-3.1-70B-Instruct
+    - internlm/internlm3-8b-instruct
     - Qwen/Qwen3-30B-A3B
-    - Qwen/Qwen3-30B-A3B-FP8
-    - Qwen/Qwen3-32B
+    - Qwen/Qwen3-235B-A22B-Thinking-2507
+    - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
     - Qwen/Qwen3-32B-FP8
+    - Qwen/Qwen3-30B-A3B-FP8
+    - Qwen/Qwen3-30B-A3B-Base
+    - Qwen/Qwen1.5-MoE-A2.7B-Chat
+    - Qwen/Qwen2.5-VL-7B-Instruct
+    - Qwen/Qwen2.5-VL-32B-Instruct
+    - OpenGVLab/InternVL3_5-30B-A3B
+    - deepseek-ai/DeepSeek-V2-Lite-Chat
     - openai/gpt-oss-120b
     - openai/gpt-oss-20b
-    - Qwen/Qwen3-235B-A22B-Thinking-2507
+    - microsoft/Phi-3-mini-4k-instruct
+    - THUDM/glm-4v-9b
+    - THUDM/glm-4-9b-chat
+
   gptq:
     - empty
   no_kvint4:
-    - Qwen/Qwen3-0.6B-FP8
-    - Qwen/Qwen3-1.7B-FP8
-    - Qwen/Qwen3-4B-FP8
-    - Qwen/Qwen3-8B-FP8
-    - Qwen/Qwen3-14B-FP8
-    - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
+    - meta-llama/Llama-3.2-1B-Instruct
+    - OpenGVLab/InternVL3-2B
+    - OpenGVLab/InternVL3-8B
+    - Qwen/Qwen3-0.6B
+    - Qwen/Qwen3-4B
+    - Qwen/Qwen3-8B
+    - Qwen/Qwen3-32B-FP8
     - Qwen/Qwen3-30B-A3B
     - Qwen/Qwen3-30B-A3B-FP8
-    - Qwen/Qwen3-32B
-    - Qwen/Qwen3-32B-FP8
+    - Qwen/Qwen3-235B-A22B-Thinking-2507
+    - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
+    - Qwen/Qwen2.5-VL-7B-Instruct
+    - Qwen/Qwen2.5-VL-32B-Instruct
+    - Qwen/Qwen1.5-MoE-A2.7B-Chat
+    - Qwen/Qwen3-8B-Base
+    - Qwen/Qwen3-30B-A3B-Base
+    - deepseek-ai/DeepSeek-V2-Lite-Chat
     - openai/gpt-oss-120b
     - openai/gpt-oss-20b
-    - Qwen/Qwen3-235B-A22B-Thinking-2507
   no_kvint8:
-    - Qwen/Qwen3-235B-A22B-Thinking-2507
+    - deepseek-ai/DeepSeek-V2-Chat
+    - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8

 pytorch_quantization:
   awq:
-    - empty
+    - meta-llama/Llama-3.2-3B-Instruct
+    - meta-llama/Meta-Llama-3-8B-Instruct
+    - meta-llama/Meta-Llama-3.1-8B-Instruct
+    - internlm/internlm3-8b-instruct
+    - Qwen/Qwen3-0.6B
+    - Qwen/Qwen3-4B
+    - Qwen/Qwen3-8B
   w8a8:
-    - empty
+    - meta-llama/Llama-3.2-1B-Instruct
+    - meta-llama/Meta-Llama-3-8B-Instruct
+    - meta-llama/Meta-Llama-3.1-8B-Instruct
+    - internlm/internlm3-8b-instruct
+    - microsoft/Phi-3-mini-4k-instruct
   no_kvint4:
+    - meta-llama/Llama-3.2-1B-Instruct
+    - OpenGVLab/InternVL3-2B
+    - OpenGVLab/InternVL3-8B
     - Qwen/Qwen3-8B-Base
-    - Qwen/Qwen3-0.6B-FP8
-    - Qwen/Qwen3-1.7B-FP8
-    - Qwen/Qwen3-4B-FP8
-    - Qwen/Qwen3-8B-FP8
-    - Qwen/Qwen3-14B-FP8
-    - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
+    - Qwen/Qwen3-30B-A3B-Base
+    - Qwen/Qwen3-0.6B
+    - Qwen/Qwen3-4B
+    - Qwen/Qwen3-8B
+    - Qwen/Qwen3-32B-FP8
     - Qwen/Qwen3-30B-A3B
     - Qwen/Qwen3-30B-A3B-FP8
-    - Qwen/Qwen3-32B
-    - Qwen/Qwen3-32B-FP8
-    - moonshotai/Kimi-K2-Instruct-0905
     - Qwen/Qwen3-235B-A22B-Thinking-2507
-    - internlm/Intern-S1-Pro-FP8
-    - JetLM/SDAR-30B-A3B-Sci
-    - deepseek/DeepSeek-V3.1
+    - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
+    - Qwen/Qwen3-VL-8B-Instruct
+    - Qwen/Qwen3-VL-30B-A3B-Instruct
+    - microsoft/Phi-3-vision-128k-instruct
+    - microsoft/Phi-3.5-vision-instruct
+    - zai-org/GLM-4.7-Flash
+    - zai-org/GLM-5
+    - Qwen/Qwen3.5-27B
+    - Qwen/Qwen3.5-35B-A3B
+    - Qwen/Qwen3.5-122B-A10B
+    - deepseek-ai/DeepSeek-V3.1
   no_kvint8:
-    - Qwen/Qwen3-235B-A22B-Thinking-2507
-    - internlm/Intern-S1-Pro-FP8
-    - deepseek/DeepSeek-V3.1
+    - zai-org/GLM-4.7-Flash
+    - zai-org/GLM-5
+    - deepseek-ai/DeepSeek-V3.1
+    - Qwen/Qwen3.5-27B
+    - Qwen/Qwen3.5-35B-A3B
+    - Qwen/Qwen3.5-122B-A10B
+    - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8

 longtext_model:
   - Qwen/Qwen3-30B-A3B
   - Qwen/Qwen3-235B-A22B-Thinking-2507
+  - zai-org/GLM-5
+  - Qwen/Qwen3.5-27B
+  - Qwen/Qwen3.5-35B-A3B
+  - Qwen/Qwen3.5-122B-A10B

 benchmark_model:
-  - meta-llama/Meta-Llama-3-1-8B-Instruct
-  - meta-llama/Meta-Llama-3-1-70B-Instruct
-  - Qwen/Qwen3-32B
+  - meta-llama/Meta-Llama-3.1-8B-Instruct
+  - meta-llama/Meta-Llama-3.1-70B-Instruct
   - Qwen/Qwen3-30B-A3B
   - Qwen/Qwen3-235B-A22B-Thinking-2507
   - Qwen/Qwen2.5-72B-Instruct
   - openai/gpt-oss-120b
   - openai/gpt-oss-20b
   - unsloth/gpt-oss-20b-BF16
-  - unsloth/gpt-oss-120b-BF16
+  - zai-org/GLM-5
+  - Qwen/Qwen3.5-27B
+  - Qwen/Qwen3.5-35B-A3B
+  - Qwen/Qwen3.5-122B-A10B
+  - google/gemma-3-27b-it

 evaluate_model:
-  - Qwen/Qwen3-32B
   - Qwen/Qwen3-32B-FP8
   - Qwen/Qwen3-30B-A3B
   - Qwen/Qwen3-30B-A3B-FP8
   - Qwen/Qwen3-235B-A22B-Thinking-2507
   - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
   - openai/gpt-oss-120b
-  - unsloth/gpt-oss-120b-BF16
-  - deepseek/DeepSeek-V3.1
-  - moonshotai/Kimi-K2-Instruct-0905
-  - internlm/Intern-S1-Pro-FP8
-  - JetLM/SDAR-30B-A3B-Sci
+  - deepseek-ai/DeepSeek-V3.1
+  - zai-org/GLM-5
+  - Qwen/Qwen3.5-27B
+  - Qwen/Qwen3.5-35B-A3B
+  - Qwen/Qwen3.5-122B-A10B

 mllm_evaluate_model:
   - OpenGVLab/InternVL3_5-38B
diff --git a/autotest/config_h_legacy.yml b/autotest/config_h_legacy.yml
index 02c9f9fcc6..a9d922aad6 100644
--- a/autotest/config_h_legacy.yml
+++ b/autotest/config_h_legacy.yml
@@ -5,14 +5,25 @@ server_log_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/cicd-autote
 eval_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/cicd-autotest/eval_resource/evaluation_report
 mllm_eval_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/cicd-autotest/eval_resource/mllm_evaluation_report
 benchmark_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/cicd-autotest/eval_resource/benchmark_report
-dataset_path: /mnt/shared-storage-user/auto-eval-pipeline/datasets/ShareGPT_V3_unfiltered_cleaned_split.json
-prefix_dataset_path: /mnt/shared-storage-user/auto-eval-pipeline/datasets/prefix_cache_test.json
+dataset_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/cicd-autotest/eval_resource/datasets/ShareGPT_V3_unfiltered_cleaned_split.json
+prefix_dataset_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/cicd-autotest/eval_resource/datasets/prefix_cache_test.json
 env_tag: h
 device: cuda

 config:
   tp:
     internlm/Intern-S1: 4
+    internlm/Intern-S1-Pro-FP8: 16
+    JetLM/SDAR-30B-A3B-Sci: 2
+    moonshotai/Kimi-K2-Instruct-0905: 16
+
+  dp_ep:
+    internlm/Intern-S1-Pro-FP8:
+      dp: 16
+      ep: 16
+    moonshotai/Kimi-K2-Instruct-0905:
+      dp: 16
+      ep: 16

 turbomind_chat_model:
   tp:
@@ -23,6 +34,11 @@ pytorch_chat_model:
   tp:
     - internlm/Intern-S1
     - internlm/Intern-S1-mini
+    - JetLM/SDAR-30B-A3B-Sci
+    - moonshotai/Kimi-K2-Instruct-0905
+
+  dp_ep:
+    - moonshotai/Kimi-K2-Instruct-0905

 turbomind_vl_model:
   tp:
@@ -60,6 +76,7 @@ pytorch_quantization:
   no_kvint4:
     - internlm/Intern-S1
    - internlm/Intern-S1-mini
+    - JetLM/SDAR-30B-A3B-Sci
   no_kvint8:
     - empty

@@ -67,6 +84,11 @@ benchmark_model:
   - internlm/Intern-S1
   - internlm/Intern-S1-mini

+evaluate_model:
+  - internlm/Intern-S1-Pro-FP8
+  - JetLM/SDAR-30B-A3B-Sci
+  - moonshotai/Kimi-K2-Instruct-0905
+
 mllm_evaluate_model:
   - internlm/Intern-S1
   - internlm/Intern-S1-mini
diff --git a/autotest/evaluate/test_api_evaluate.py b/autotest/evaluate/test_api_evaluate.py
index dda08d722c..785b79f668 100644
--- a/autotest/evaluate/test_api_evaluate.py
+++ b/autotest/evaluate/test_api_evaluate.py
@@ -23,6 +23,8 @@ def _run_ray_distributed_test(
         eval_config_name = 'gpt'
     elif 'intern-s1-pro' in run_config.get('model', '').lower():
         eval_config_name = 'intern-s1-pro'
+    elif 'qwen3.5' in run_config.get('model', '').lower():
+        eval_config_name = 'qwen3.5'

     if str(config.get('env_tag')) == 'ascend':
         eval_config_name = f'{eval_config_name}-2batch'
@@ -68,6 +70,8 @@ def _run_proxy_distributed_test(config,
         eval_config_name = 'gpt'
     elif 'intern-s1-pro' in run_config.get('model', '').lower():
         eval_config_name = 'intern-s1-pro'
+    elif 'qwen3.5' in run_config.get('model', '').lower():
+        eval_config_name = 'qwen3.5'

     if str(config.get('env_tag')) == 'ascend':
         eval_config_name = f'{eval_config_name}-2batch'
@@ -116,6 +120,8 @@ def run_eval_test(config, run_config, worker_id, test_type='infer', eval_config_
         eval_config_name = 'sdar'
     elif 'intern-s1-pro' in run_config.get('model', '').lower():
         eval_config_name = 'intern-s1-pro'
+    elif 'qwen3.5' in run_config.get('model', '').lower():
+        eval_config_name = 'qwen3.5'
     if str(config.get('env_tag')) == 'a100':
         eval_config_name = f'{eval_config_name}-32k'
     elif str(config.get('env_tag')) == 'ascend':
@@ -231,7 +237,7 @@ def test_turbomind_infer_tp8(config, run_config, worker_id):

 @pytest.mark.infer
 @pytest.mark.turbomind
-@pytest.mark.gpu_num_cp2tp8
+@pytest.mark.gpu_num_distributed_cp2tp8
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.parametrize('run_config', get_models('turbomind', {'cp': 2, 'tp': 8}))
 def test_turbomind_infer_cp2tp8(config, run_config, worker_id):
@@ -442,7 +448,7 @@ def test_pytorch_eval_distributed_dpep16(config, run_config, worker_id):

 @pytest.mark.eval
 @pytest.mark.turbomind
-@pytest.mark.gpu_num_cp2tp8
+@pytest.mark.gpu_num_distributed_cp2tp8
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.parametrize('run_config', get_models('turbomind', {'cp': 2, 'tp': 8}))
 def test_turbomind_eval_cp2tp8(config, run_config, worker_id):
diff --git a/autotest/tools/common_case_config.py b/autotest/tools/common_case_config.py
index 12334e8815..3eac291a8f 100644
--- a/autotest/tools/common_case_config.py
+++ b/autotest/tools/common_case_config.py
@@ -349,6 +349,20 @@
             'model': 'yuhuili/EAGLE3-LLaMA3.1-Instruct-8B'
         }
     }
+}, {
+    'model': 'zai-org/GLM-4.7-Flash',
+    'communicator': 'nccl',
+    'quant_policy': 0,
+    'parallel_config': {
+        'tp': 2
+    },
+    'extra_params': {
+        'max_batch_size': 128,
+        'speculative_config': {
+            'method': 'deepseek_mtp',
+            'num_speculative_tokens': 3
+        }
+    }
 }]

 SPECULATIVE_DECODING_PIPELINE_TEST_LLM = [{
@@ -380,6 +394,18 @@
         'speculative-num-draft-tokens': 3,
         'max-batch-size': 128
     }
+}, {
+    'model': 'zai-org/GLM-4.7-Flash',
+    'communicator': 'nccl',
+    'quant_policy': 0,
+    'parallel_config': {
+        'tp': 2
+    },
+    'extra_params': {
+        'speculative-algorithm': 'deepseek_mtp',
+        'speculative-num-draft-tokens': 3,
+        'max-batch-size': 128
+    }
 }]

 SPECULATIVE_DECODING_RESTFUL_TEST_LLM = [{
diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py
index bc41a8156c..6ee27bdd8c 100644
--- a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py
+++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py
@@ -105,3 +105,13 @@ def test_pytorch_chat_with_lora_tp2(config, run_config, common_case_config, work
 def test_pipeline_chat_speculative_decoding_tp1(config, run_config, common_case_config, worker_id):
     case_config = {k: v for k, v in common_case_config.items() if k == 'memory_test'}
     run_pipeline_llm_test(config, run_config, case_config, worker_id)
+
+
+@pytest.mark.usefixtures('common_case_config')
+@pytest.mark.flaky(reruns=0)
+@pytest.mark.gpu_num_2
+@pytest.mark.parametrize(
+    'run_config', [item for item in SPECULATIVE_DECODING_PIPELINE_TEST_LLM if item['parallel_config'].get('tp') == 2])
+def test_pipeline_chat_speculative_decoding_tp2(config, run_config, common_case_config, worker_id):
+    case_config = {k: v for k, v in common_case_config.items() if k == 'memory_test'}
+    run_pipeline_llm_test(config, run_config, case_config, worker_id)
diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py
index c27822eb47..70be8056e9 100644
--- a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py
+++ b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py
@@ -239,6 +239,16 @@ def test_restful_chat_speculative_decoding_tp1(config, run_config, common_case_c
     run_llm_test(config, run_config, case_config, worker_id)


+@pytest.mark.usefixtures('common_case_config')
+@pytest.mark.flaky(reruns=0)
+@pytest.mark.gpu_num_2
+@pytest.mark.parametrize(
+    'run_config', [item for item in SPECULATIVE_DECODING_RESTFUL_TEST_LLM if item['parallel_config'].get('tp') == 2])
+def test_restful_chat_speculative_decoding_tp2(config, run_config, common_case_config, worker_id):
+    case_config = {k: v for k, v in common_case_config.items() if k == 'memory_test'}
+    run_llm_test(config, run_config, case_config, worker_id)
+
+
 @pytest.mark.usefixtures('common_case_config')
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.gpu_num_distributed_tp16
diff --git a/autotest/utils/benchmark_utils.py b/autotest/utils/benchmark_utils.py
index 12a8979025..0185492c27 100644
--- a/autotest/utils/benchmark_utils.py
+++ b/autotest/utils/benchmark_utils.py
@@ -1,3 +1,4 @@
+import copy
 import os
 import time

@@ -7,6 +8,8 @@
 from utils.config_utils import get_case_str_by_config, get_cli_common_param, get_cuda_prefix_by_workerid, get_workerid
 from utils.run_restful_chat import health_check, start_openai_service, terminate_restful_api

+SERVE_ONLY_PARAMS = {'max-batch-size', 'max-prefill-token-num', 'server-name', 'enable-prefix-caching'}
+

 def throughput_test(config, run_config, worker_id: str = '', is_smoke: bool = False):
     model = run_config.get('model')
@@ -26,7 +29,12 @@ def throughput_test(config, run_config, worker_id: str = '', is_smoke: bool = Fa

     cuda_prefix = get_cuda_prefix_by_workerid(worker_id, run_config.get('parallel_config'))

-    command = f'{cuda_prefix} python3 benchmark/profile_throughput.py {dataset_path} {model_path} {get_cli_common_param(run_config)}'  # noqa
+    bench_config = copy.deepcopy(run_config)
+    bench_config['extra_params'] = {
+        k: v
+        for k, v in bench_config.get('extra_params', {}).items() if k not in SERVE_ONLY_PARAMS
+    }
+    command = f'{cuda_prefix} python3 benchmark/profile_throughput.py {dataset_path} {model_path} {get_cli_common_param(bench_config)}'  # noqa

     if is_smoke:
         num_prompts = '--num-prompts 100'
@@ -72,7 +80,12 @@ def longtext_throughput_test(config, run_config, worker_id: str = ''):

     cuda_prefix = get_cuda_prefix_by_workerid(worker_id, run_config.get('parallel_config'))

-    command = f'{cuda_prefix} python3 benchmark/profile_pipeline_api.py {dataset_path} {model_path} {get_cli_common_param(run_config)}'  # noqa
+    bench_config = copy.deepcopy(run_config)
+    bench_config['extra_params'] = {
+        k: v
+        for k, v in bench_config.get('extra_params', {}).items() if k not in SERVE_ONLY_PARAMS
+    }
+    command = f'{cuda_prefix} python3 benchmark/profile_pipeline_api.py {dataset_path} {model_path} {get_cli_common_param(bench_config)}'  # noqa

     env = os.environ.copy()
     env.update(run_config.get('env', {}))
@@ -210,10 +223,11 @@ def prefixcache_throughput_test(config, run_config, worker_id: str = '', is_smok

     cuda_prefix = get_cuda_prefix_by_workerid(worker_id, run_config.get('parallel_config'))

-    run_config_new = run_config.copy()
-    if 'extra_params' not in run_config_new:
-        run_config_new['extra_params'] = {}
-    run_config_new['extra_params'].pop('enable-prefix-caching', None)
+    run_config_new = copy.deepcopy(run_config)
+    run_config_new['extra_params'] = {
+        k: v
+        for k, v in run_config_new.get('extra_params', {}).items() if k not in SERVE_ONLY_PARAMS
+    }
     run_config_new['extra_params']['session-len'] = 32768

     command = f'{cuda_prefix} python3 benchmark/profile_pipeline_api.py {dataset_path} {model_path} {get_cli_common_param(run_config_new)}'  # noqa
diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py
index 3d71fe1e0d..16f79dc070 100644
--- a/autotest/utils/config_utils.py
+++ b/autotest/utils/config_utils.py
@@ -102,6 +102,13 @@ def get_func_config_list(backend: str,
             run_config['extra_params']['cache-max-entry-count'] = 0.9
             run_config['extra_params']['max-batch-size'] = 1024

+        if 'GLM-5' in run_config['model']:
+            run_config['extra_params']['cache-max-entry-count'] = 0.9
+            run_config['extra_params']['max-batch-size'] = 128
+
+        if 'Qwen3.5' in run_config['model']:
+            run_config['extra_params']['session-len'] = 128000
+
         if config.get('env_tag', '') in ['3090', '5080']:
             run_config['extra_params']['cache-max-entry-count'] = 0.5
@@ -128,6 +135,9 @@ def get_func_config_list(backend: str,
             run_config['extra_params']['max-prefill-token-num'] = 1024
             run_config['extra_params']['max-batch-size'] = 128

+        if 'openai/gpt-oss' in run_config['model']:
+            run_config['extra_params']['model-format'] = 'mxfp4'
+
     return run_configs

diff --git a/autotest/utils/constant.py b/autotest/utils/constant.py
index 153d3220e7..28d5933963 100644
--- a/autotest/utils/constant.py
+++ b/autotest/utils/constant.py
@@ -136,7 +136,37 @@
             'top_k': 50,
             'min_p': 0.0,
         }
-    }
+    },
+    'qwen3.5': {
+        'query_per_second': 4,
+        'max_out_len': 128000,
+        'max_seq_len': 128000,
+        'batch_size': 500,
+        'temperature': 1.0,
+        'openai_extra_kwargs': {
+            'top_p': 0.95,
+            'presence_penalty': 1.5,
+        },
+        'extra_body': {
+            'top_k': 20,
+            'min_p': 0.0,
+        }
+    },
+    'qwen3.5-2batch': {
+        'query_per_second': 4,
+        'max_out_len': 128000,
+        'max_seq_len': 128000,
+        'batch_size': 2,
+        'temperature': 1.0,
+        'openai_extra_kwargs': {
+            'top_p': 0.95,
+            'presence_penalty': 1.5,
+        },
+        'extra_body': {
+            'top_k': 20,
+            'min_p': 0.0,
+        }
+    },
 }

 MLLM_EVAL_CONFIGS = {
diff --git a/autotest/utils/run_client_chat.py b/autotest/utils/run_client_chat.py
index d02a758073..b5a57cdea1 100644
--- a/autotest/utils/run_client_chat.py
+++ b/autotest/utils/run_client_chat.py
@@ -1,3 +1,4 @@
+import copy
 import os
 import time
 from subprocess import PIPE, Popen
@@ -8,6 +9,11 @@

 TEMPLATE = 'autotest/template.json'

+CHAT_EXCLUDED_PARAMS = {
+    'max-batch-size', 'cache-max-entry-count', 'max-prefill-token-num', 'server-name', 'enable-prefix-caching',
+    'dllm-block-length', 'dllm-denoising-steps', 'dllm-confidence-threshold'
+}
+

 def run_tests(config, usercase, cli_case_config, run_config, worker_id):
     if 'coder' in run_config['model'].lower() and usercase == 'chat_testcase':
@@ -28,14 +34,19 @@ def hf_command_line_test(config, case, case_info, run_config, cuda_prefix: str =
     else:
         model_path = os.path.join(config.get('model_path'), model)

-    run_config['extra_params']['session_len'] = 4096
+    chat_config = copy.deepcopy(run_config)
+    chat_config['extra_params'] = {
+        k: v
+        for k, v in chat_config.get('extra_params', {}).items() if k not in CHAT_EXCLUDED_PARAMS
+    }
+    chat_config['extra_params']['session_len'] = 4096
     if case == 'base_testcase':
-        run_config['extra_params']['chat_template'] = TEMPLATE
-        run_config['extra_params']['session_len'] = 512
+        chat_config['extra_params']['chat_template'] = TEMPLATE
+        chat_config['extra_params']['session_len'] = 512

-    print(run_config)
+    print(chat_config)

-    cmd = ' '.join([cuda_prefix, ' '.join(['lmdeploy chat', model_path, get_cli_common_param(run_config)])]).strip()
+    cmd = ' '.join([cuda_prefix, ' '.join(['lmdeploy chat', model_path, get_cli_common_param(chat_config)])]).strip()

     result, chat_log, msg = command_test(config, cmd, run_config, case_info, True)
     if chat_log:

From c9d557fe31a7137e42aa2ef3d9ea7ef6a6e37000 Mon Sep 17 00:00:00 2001
From: littlegy <787321726@qq.com>
Date: Wed, 18 Mar 2026 16:10:43 +0800
Subject: [PATCH 02/10] update config

---
 autotest/config_ascend.yml | 2 +-
 autotest/config_h.yml      | 2 +-
 autotest/config_test.yml   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/autotest/config_ascend.yml b/autotest/config_ascend.yml
index 55a5e7728a..6087805f03 100644
--- a/autotest/config_ascend.yml
+++ b/autotest/config_ascend.yml
@@ -82,7 +82,7 @@ pytorch_quantization:
     - Qwen/Qwen3-VL-8B-Instruct
     - Qwen/Qwen3-VL-32B-Instruct

-longtext_model:
+longtext_benchmark_model:
   - Qwen/Qwen3-30B-A3B

 benchmark_model:
diff --git a/autotest/config_h.yml b/autotest/config_h.yml
index b0ed3a8f87..df2f88f304 100644
--- a/autotest/config_h.yml
+++ b/autotest/config_h.yml
@@ -253,7 +253,7 @@ pytorch_quantization:
     - Qwen/Qwen3.5-122B-A10B
     - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8

-longtext_model:
+longtext_benchmark_model:
   - Qwen/Qwen3-30B-A3B
   - Qwen/Qwen3-235B-A22B-Thinking-2507
   - zai-org/GLM-5
diff --git a/autotest/config_test.yml b/autotest/config_test.yml
index 2ac9d56bef..70c3ab4d37 100644
--- a/autotest/config_test.yml
+++ b/autotest/config_test.yml
@@ -167,7 +167,7 @@ pytorch_quantization:
     - test/test_vl_tp1
     - test/test_vl_dpep8

-longtext_model:
+longtext_benchmark_model:
   - test/test_tp1
   - test/test_tp1_pytorch
   - test/test_vl_tp2

From 92fba62468f0a77edef75fe89d9998aa5d8edd3b Mon Sep 17 00:00:00 2001
From: littlegy <787321726@qq.com>
Date: Wed, 18 Mar 2026 17:12:16 +0800
Subject: [PATCH 03/10] fix benchmark test

---
 autotest/utils/benchmark_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/autotest/utils/benchmark_utils.py b/autotest/utils/benchmark_utils.py
index 0185492c27..c8151b1359 100644
--- a/autotest/utils/benchmark_utils.py
+++ b/autotest/utils/benchmark_utils.py
@@ -8,7 +8,7 @@
 from utils.config_utils import get_case_str_by_config, get_cli_common_param, get_cuda_prefix_by_workerid, get_workerid
 from utils.run_restful_chat import health_check, start_openai_service, terminate_restful_api

-SERVE_ONLY_PARAMS = {'max-batch-size', 'max-prefill-token-num', 'server-name', 'enable-prefix-caching'}
+SERVE_ONLY_PARAMS = {'max-batch-size', 'max-prefill-token-num', 'server-name', 'enable-prefix-caching', 'session-len'}  # yapf: disable


 def throughput_test(config, run_config, worker_id: str = '', is_smoke: bool = False):

From 5d7f415760090cbfb227e4fd21f53233b0309852 Mon Sep 17 00:00:00 2001
From: littlegy <787321726@qq.com>
Date: Thu, 19 Mar 2026 15:57:38 +0800
Subject: [PATCH 04/10] fix gpt-oss args

---
 autotest/utils/benchmark_utils.py | 7 ++++++-
 autotest/utils/config_utils.py    | 3 ---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/autotest/utils/benchmark_utils.py b/autotest/utils/benchmark_utils.py
index c8151b1359..0c8d3a5789 100644
--- a/autotest/utils/benchmark_utils.py
+++ b/autotest/utils/benchmark_utils.py
@@ -8,7 +8,10 @@
 from utils.config_utils import get_case_str_by_config, get_cli_common_param, get_cuda_prefix_by_workerid, get_workerid
 from utils.run_restful_chat import health_check, start_openai_service, terminate_restful_api

-SERVE_ONLY_PARAMS = {'max-batch-size', 'max-prefill-token-num', 'server-name', 'enable-prefix-caching', 'session-len'}  # yapf: disable
+SERVE_ONLY_PARAMS = {  # yapf: disable
+    'max-batch-size', 'max-prefill-token-num', 'server-name',
+    'enable-prefix-caching', 'session-len',
+}


 def throughput_test(config, run_config, worker_id: str = '', is_smoke: bool = False):
@@ -34,6 +37,8 @@ def throughput_test(config, run_config, worker_id: str = '', is_smoke: bool = Fa
         k: v
         for k, v in bench_config.get('extra_params', {}).items() if k not in SERVE_ONLY_PARAMS
     }
+    if 'openai/gpt-oss' in run_config.get('model', ''):
+        bench_config['extra_params']['model-format'] = 'mxfp4'
     command = f'{cuda_prefix} python3 benchmark/profile_throughput.py {dataset_path} {model_path} {get_cli_common_param(bench_config)}'  # noqa

     if is_smoke:
diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py
index 16f79dc070..bbe375f987 100644
--- a/autotest/utils/config_utils.py
+++ b/autotest/utils/config_utils.py
@@ -135,9 +135,6 @@ def get_func_config_list(backend: str,
             run_config['extra_params']['max-prefill-token-num'] = 1024
             run_config['extra_params']['max-batch-size'] = 128

-        if 'openai/gpt-oss' in run_config['model']:
-            run_config['extra_params']['model-format'] = 'mxfp4'
-
     return run_configs

From 11424c7c4a8b77edef75fe89d9998aa5d8edd53b Mon Sep 17 00:00:00 2001
From: littlegy <787321726@qq.com>
Date: Thu, 19 Mar 2026 16:22:44 +0800
Subject: [PATCH 05/10] update

---
 autotest/utils/benchmark_utils.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/autotest/utils/benchmark_utils.py b/autotest/utils/benchmark_utils.py
index 0c8d3a5789..c425ddafa2 100644
--- a/autotest/utils/benchmark_utils.py
+++ b/autotest/utils/benchmark_utils.py
@@ -37,7 +37,7 @@ def throughput_test(config, run_config, worker_id: str = '', is_smoke: bool = Fa
         k: v
         for k, v in bench_config.get('extra_params', {}).items() if k not in SERVE_ONLY_PARAMS
     }
-    if 'openai/gpt-oss' in run_config.get('model', ''):
+    if 'openai/gpt-oss' in run_config.get('model', '') and run_config.get('backend') == 'turbomind':
         bench_config['extra_params']['model-format'] = 'mxfp4'
     command = f'{cuda_prefix} python3 benchmark/profile_throughput.py {dataset_path} {model_path} {get_cli_common_param(bench_config)}'  # noqa

     if is_smoke:
@@ -90,6 +90,8 @@ def longtext_throughput_test(config, run_config, worker_id: str = ''):
         k: v
         for k, v in bench_config.get('extra_params', {}).items() if k not in SERVE_ONLY_PARAMS
     }
+    if 'openai/gpt-oss' in run_config.get('model', '') and run_config.get('backend') == 'turbomind':
+        bench_config['extra_params']['model-format'] = 'mxfp4'
     command = f'{cuda_prefix} python3 benchmark/profile_pipeline_api.py {dataset_path} {model_path} {get_cli_common_param(bench_config)}'  # noqa

     env = os.environ.copy()
@@ -234,6 +236,8 @@ def prefixcache_throughput_test(config, run_config, worker_id: str = '', is_smok
         for k, v in run_config_new.get('extra_params', {}).items() if k not in SERVE_ONLY_PARAMS
     }
     run_config_new['extra_params']['session-len'] = 32768
+    if 'openai/gpt-oss' in run_config.get('model', '') and run_config.get('backend') == 'turbomind':
+        run_config_new['extra_params']['model-format'] = 'mxfp4'
     command = f'{cuda_prefix} python3 benchmark/profile_pipeline_api.py {dataset_path} {model_path} {get_cli_common_param(run_config_new)}'  # noqa

     env = os.environ.copy()

From 8671605c3696688d049421857f4e7f421045e37a Mon Sep 17 00:00:00 2001
From: littlegy <787321726@qq.com>
Date: Thu, 19 Mar 2026 16:38:45 +0800
Subject: [PATCH 06/10] update vl test config

---
 autotest/config_h.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/autotest/config_h.yml b/autotest/config_h.yml
index df2f88f304..b1d6d8cc3e 100644
--- a/autotest/config_h.yml
+++ b/autotest/config_h.yml
@@ -146,6 +146,9 @@ pytorch_vl_model:
     - Qwen/Qwen3.5-35B-A3B
     - Qwen/Qwen3.5-122B-A10B
     - OpenGVLab/InternVL3_5-38B
+    - Qwen/Qwen3.5-27B
+    - Qwen/Qwen3.5-35B-A3B
+    - Qwen/Qwen3.5-122B-A10B


 turbomind_base_model:
@@ -292,3 +295,6 @@ evaluate_model:
 mllm_evaluate_model:
   - OpenGVLab/InternVL3_5-38B
   - Qwen/Qwen3-VL-30B-A3B-Instruct
+  - Qwen/Qwen3.5-27B
+  - Qwen/Qwen3.5-35B-A3B
+  - Qwen/Qwen3.5-122B-A10B

From a2f1446a0b4fa7905cfea909e6d5c2be24a39e73 Mon Sep 17 00:00:00 2001
From: littlegy <787321726@qq.com>
Date: Thu, 19 Mar 2026 17:02:19 +0800
Subject: [PATCH 07/10] update benchmark test

---
 autotest/utils/benchmark_utils.py | 6 ------
 autotest/utils/config_utils.py    | 4 ++++
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/autotest/utils/benchmark_utils.py b/autotest/utils/benchmark_utils.py
index c425ddafa2..fffa79a40b 100644
--- a/autotest/utils/benchmark_utils.py
+++ b/autotest/utils/benchmark_utils.py
@@ -37,8 +37,6 @@ def throughput_test(config, run_config, worker_id: str = '', is_smoke: bool = Fa
         k: v
         for k, v in bench_config.get('extra_params', {}).items() if k not in SERVE_ONLY_PARAMS
     }
-    if 'openai/gpt-oss' in run_config.get('model', '') and run_config.get('backend') == 'turbomind':
-        bench_config['extra_params']['model-format'] = 'mxfp4'
     command = f'{cuda_prefix} python3 benchmark/profile_throughput.py {dataset_path} {model_path} {get_cli_common_param(bench_config)}'  # noqa

     if is_smoke:
@@ -90,8 +88,6 @@ def longtext_throughput_test(config, run_config, worker_id: str = ''):
         k: v
         for k, v in bench_config.get('extra_params', {}).items() if k not in SERVE_ONLY_PARAMS
     }
-    if 'openai/gpt-oss' in run_config.get('model', '') and run_config.get('backend') == 'turbomind':
-        bench_config['extra_params']['model-format'] = 'mxfp4'
     command = f'{cuda_prefix} python3 benchmark/profile_pipeline_api.py {dataset_path} {model_path} {get_cli_common_param(bench_config)}'  # noqa

     env = os.environ.copy()
@@ -236,8 +232,6 @@ def prefixcache_throughput_test(config, run_config, worker_id: str = '', is_smok
         for k, v in run_config_new.get('extra_params', {}).items() if k not in SERVE_ONLY_PARAMS
     }
     run_config_new['extra_params']['session-len'] = 32768
-    if 'openai/gpt-oss' in run_config.get('model', '') and run_config.get('backend') == 'turbomind':
-        run_config_new['extra_params']['model-format'] = 'mxfp4'
     command = f'{cuda_prefix} python3 benchmark/profile_pipeline_api.py {dataset_path} {model_path} {get_cli_common_param(run_config_new)}'  # noqa

     env = os.environ.copy()
diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py
index bbe375f987..b6283ddda7 100644
--- a/autotest/utils/config_utils.py
+++ b/autotest/utils/config_utils.py
@@ -135,6 +135,10 @@ def get_func_config_list(backend: str,
             run_config['extra_params']['max-prefill-token-num'] = 1024
             run_config['extra_params']['max-batch-size'] = 128

+        if ('openai/gpt-oss' in run_config['model'] and backend == 'turbomind'
+                and func_type in ('benchmark', 'longtext_benchmark')):
+            run_config['extra_params']['model-format'] = 'mxfp4'
+
     return run_configs

From c6efe39f10cf5cc682c560cdee7add7ab5c49f0c Mon Sep 17 00:00:00 2001
From: littlegy <787321726@qq.com>
Date: Mon, 13 Apr 2026 16:44:44 +0800
Subject: [PATCH 08/10] update config h and add qwen3.5 mtp

---
 autotest/config_ascend.yml                    |  13 +-
 autotest/config_h.yml                         |  63 +++++-
 autotest/config_h_legacy.yml                  |  36 +---
 autotest/evaluate/eval_config_chat.py         |  11 +-
 .../evaluate/eval_config_chat_512_longtext.py | 117 +++++++++++
 .../evaluate/eval_config_chat_longtext.py     | 184 +++++++++++++++++
 autotest/evaluate/test_api_evaluate.py        | 189 ++++++++++++++++--
 autotest/tools/common_case_config.py          |  90 ++++++++-
 autotest/utils/config_utils.py                |   6 +-
 autotest/utils/constant.py                    |  53 +++--
 autotest/utils/evaluate_utils.py              |  19 +-
 11 files changed, 706 insertions(+), 75 deletions(-)
 create mode 100644 autotest/evaluate/eval_config_chat_512_longtext.py
 create mode 100644 autotest/evaluate/eval_config_chat_longtext.py

diff --git a/autotest/config_ascend.yml b/autotest/config_ascend.yml
index 6087805f03..29efc3826d 100644
--- a/autotest/config_ascend.yml
+++ b/autotest/config_ascend.yml
@@ -23,6 +23,12 @@ config:
     Qwen/Qwen3-VL-30B-A3B-Instruct: 4
     Qwen/Qwen3-VL-8B-Instruct: 2
     Qwen/Qwen3-VL-32B-Instruct: 4
+    internlm/Intern-S1-Pro-BF16: 64
+
+  dp_ep:
+    internlm/Intern-S1-Pro-BF16:
+      dp: 64
+      ep: 64

 pytorch_chat_model:
   tp:
@@ -31,6 +37,10 @@ pytorch_chat_model:
     - Qwen/Qwen3-32B
     - Qwen/Qwen3-8B
     - Qwen/Qwen3-0.6B
+    - internlm/Intern-S1-Pro-BF16
+
+  dp_ep:
+    - internlm/Intern-S1-Pro-BF16

 pytorch_vl_model:
   tp:
@@ -43,7 +53,6 @@ pytorch_vl_model:
     - Qwen/Qwen3-VL-8B-Instruct
     - Qwen/Qwen3-VL-32B-Instruct

-
 pytorch_base_model:
   tp:
     - Qwen/Qwen3-0.6B
@@ -67,6 +76,7 @@ pytorch_quantization:
     - Qwen/Qwen3-VL-30B-A3B-Instruct
     - Qwen/Qwen3-VL-8B-Instruct
     - Qwen/Qwen3-VL-32B-Instruct
+    - internlm/Intern-S1-Pro-BF16
   no_kvint8:
     - Qwen/Qwen3-30B-A3B
     - Qwen/Qwen3-235B-A22B
@@ -81,6 +91,7 @@ pytorch_quantization:
     - Qwen/Qwen3-VL-30B-A3B-Instruct
     - Qwen/Qwen3-VL-8B-Instruct
    - Qwen/Qwen3-VL-32B-Instruct
+    - internlm/Intern-S1-Pro-BF16

 longtext_benchmark_model:
   - Qwen/Qwen3-30B-A3B
diff --git a/autotest/config_h.yml b/autotest/config_h.yml
index b1d6d8cc3e..675f1f6f4b 100644
--- a/autotest/config_h.yml
+++ b/autotest/config_h.yml
@@ -22,7 +22,7 @@ config:
     Qwen/Qwen3-235B-A22B-Thinking-2507: 8
     OpenGVLab/InternVL3_5-38B: 2
     Qwen/Qwen3-VL-30B-A3B-Instruct: 2
-    zai-org/GLM-5: 16
+    zai-org/GLM-5-FP8: 8
     Qwen/Qwen3.5-27B: 2
     Qwen/Qwen3.5-35B-A3B: 2
     Qwen/Qwen3.5-122B-A10B: 4
@@ -35,11 +35,16 @@ config:
     OpenGVLab/InternVL3_5-30B-A3B: 2
     zai-org/GLM-4.7-Flash: 2
     google/gemma-3-27b-it: 2
+    internlm/Intern-S1: 4
+    internlm/Intern-S1-Pro-FP8: 16

   dp_ep:
     Qwen/Qwen3-235B-A22B-Thinking-2507:
       dp: 8
       ep: 8
+    internlm/Intern-S1-Pro-FP8:
+      dp: 16
+      ep: 16

   cp_tp:
     Qwen/Qwen3-235B-A22B-Thinking-2507:
@@ -76,6 +81,8 @@ turbomind_chat_model:
     - THUDM/glm-4-9b-chat
     - openai/gpt-oss-120b
     - openai/gpt-oss-20b
+    - internlm/Intern-S1
+    - internlm/Intern-S1-mini

   cp_tp:
     - Qwen/Qwen3-235B-A22B-Thinking-2507
@@ -110,17 +117,22 @@ pytorch_chat_model:
     - zai-org/GLM-4.7-Flash
     - microsoft/Phi-3.5-vision-instruct
     - microsoft/Phi-3-vision-128k-instruct
-    - zai-org/GLM-5
+    - zai-org/GLM-5-FP8
     - Qwen/Qwen3.5-27B
     - Qwen/Qwen3.5-35B-A3B
+    - Qwen/Qwen3.5-35B-A3B-FP8
     - Qwen/Qwen3.5-122B-A10B
     - deepseek-ai/DeepSeek-V3.1
     - unsloth/gpt-oss-20b-BF16
     - google/gemma-3-27b-it
     - OpenGVLab/InternVL3_5-38B
+    - internlm/Intern-S1
+    - internlm/Intern-S1-mini
+    - internlm/Intern-S1-Pro-FP8

   dp_ep:
     - Qwen/Qwen3-235B-A22B-Thinking-2507
+    - internlm/Intern-S1-Pro-FP8

 turbomind_vl_model:
   tp:
@@ -130,6 +142,8 @@ turbomind_vl_model:
     - Qwen/Qwen2.5-VL-7B-Instruct
     - Qwen/Qwen2.5-VL-32B-Instruct
     - OpenGVLab/InternVL3_5-38B
+    - internlm/Intern-S1
+    - internlm/Intern-S1-mini

 pytorch_vl_model:
   tp:
@@ -144,11 +158,10 @@ pytorch_vl_model:
     - microsoft/Phi-3.5-vision-instruct
     - Qwen/Qwen3.5-27B
     - Qwen/Qwen3.5-35B-A3B
+    - Qwen/Qwen3.5-35B-A3B-FP8
     - Qwen/Qwen3.5-122B-A10B
-    - OpenGVLab/InternVL3_5-38B
-    - Qwen/Qwen3.5-27B
-    - Qwen/Qwen3.5-35B-A3B
-    - Qwen/Qwen3.5-122B-A10B
+    - internlm/Intern-S1
+    - internlm/Intern-S1-mini


 turbomind_base_model:
@@ -170,6 +183,7 @@ turbomind_quantization:
     - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
     - Qwen/Qwen3-32B-FP8
     - Qwen/Qwen3-30B-A3B-FP8
+    - Qwen/Qwen3.5-35B-A3B-FP8
     - Qwen/Qwen3-30B-A3B-Base
     - Qwen/Qwen1.5-MoE-A2.7B-Chat
     - Qwen/Qwen2.5-VL-7B-Instruct
@@ -181,6 +195,8 @@ turbomind_quantization:
     - microsoft/Phi-3-mini-4k-instruct
     - THUDM/glm-4v-9b
     - THUDM/glm-4-9b-chat
+    - internlm/Intern-S1
+    - internlm/Intern-S1-mini

   gptq:
     - empty
@@ -194,6 +210,7 @@ turbomind_quantization:
     - Qwen/Qwen3-32B-FP8
     - Qwen/Qwen3-30B-A3B
     - Qwen/Qwen3-30B-A3B-FP8
+    - Qwen/Qwen3.5-35B-A3B-FP8
     - Qwen/Qwen3-235B-A22B-Thinking-2507
     - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
     - Qwen/Qwen2.5-VL-7B-Instruct
@@ -204,6 +221,8 @@ turbomind_quantization:
     - deepseek-ai/DeepSeek-V2-Lite-Chat
     - openai/gpt-oss-120b
     - openai/gpt-oss-20b
+    - internlm/Intern-S1
+    - internlm/Intern-S1-mini
   no_kvint8:
     - deepseek-ai/DeepSeek-V2-Chat
     - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
@@ -235,6 +254,7 @@ pytorch_quantization:
     - Qwen/Qwen3-32B-FP8
     - Qwen/Qwen3-30B-A3B
     - Qwen/Qwen3-30B-A3B-FP8
+    - Qwen/Qwen3.5-35B-A3B-FP8
     - Qwen/Qwen3-235B-A22B-Thinking-2507
     - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
     - Qwen/Qwen3-VL-8B-Instruct
@@ -242,26 +262,32 @@ pytorch_quantization:
     - microsoft/Phi-3-vision-128k-instruct
     - microsoft/Phi-3.5-vision-instruct
     - zai-org/GLM-4.7-Flash
-    - zai-org/GLM-5
+    - zai-org/GLM-5-FP8
     - Qwen/Qwen3.5-27B
     - Qwen/Qwen3.5-35B-A3B
     - Qwen/Qwen3.5-122B-A10B
     - deepseek-ai/DeepSeek-V3.1
+    - internlm/Intern-S1
+    - internlm/Intern-S1-mini
+    - internlm/Intern-S1-Pro-FP8
   no_kvint8:
     - zai-org/GLM-4.7-Flash
-    - zai-org/GLM-5
+    - zai-org/GLM-5-FP8
     - deepseek-ai/DeepSeek-V3.1
     - Qwen/Qwen3.5-27B
     - Qwen/Qwen3.5-35B-A3B
+    - Qwen/Qwen3.5-35B-A3B-FP8
     - Qwen/Qwen3.5-122B-A10B
     - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
+    - internlm/Intern-S1-Pro-FP8

 longtext_benchmark_model:
   - Qwen/Qwen3-30B-A3B
   - Qwen/Qwen3-235B-A22B-Thinking-2507
-  - zai-org/GLM-5
+  - zai-org/GLM-5-FP8
   - Qwen/Qwen3.5-27B
   - Qwen/Qwen3.5-35B-A3B
+  - Qwen/Qwen3.5-35B-A3B-FP8
   - Qwen/Qwen3.5-122B-A10B

 benchmark_model:
@@ -273,11 +299,14 @@ benchmark_model:
   - openai/gpt-oss-120b
   - openai/gpt-oss-20b
   - unsloth/gpt-oss-20b-BF16
-  - zai-org/GLM-5
+  - zai-org/GLM-5-FP8
   - Qwen/Qwen3.5-27B
   - Qwen/Qwen3.5-35B-A3B
+  - Qwen/Qwen3.5-35B-A3B-FP8
   - Qwen/Qwen3.5-122B-A10B
   - google/gemma-3-27b-it
+  - internlm/Intern-S1
+  - internlm/Intern-S1-mini

 evaluate_model:
   - Qwen/Qwen3-32B-FP8
@@ -287,14 +316,26 @@ evaluate_model:
   - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
   - openai/gpt-oss-120b
   - deepseek-ai/DeepSeek-V3.1
-  - zai-org/GLM-5
+  - zai-org/GLM-5-FP8
   - Qwen/Qwen3.5-27B
   - Qwen/Qwen3.5-35B-A3B
+  - Qwen/Qwen3.5-35B-A3B-FP8
   - Qwen/Qwen3.5-122B-A10B
+  - internlm/Intern-S1-Pro-FP8
+
+longtext_evaluate_model:
+  - Qwen/Qwen3.5-35B-A3B
+
+mtp_evaluate_model:
+  - Qwen/Qwen3.5-35B-A3B
+  - Qwen/Qwen3.5-35B-A3B-FP8

 mllm_evaluate_model:
   - OpenGVLab/InternVL3_5-38B
   - Qwen/Qwen3-VL-30B-A3B-Instruct
   - Qwen/Qwen3.5-27B
   - Qwen/Qwen3.5-35B-A3B
+  - Qwen/Qwen3.5-35B-A3B-FP8
   - Qwen/Qwen3.5-122B-A10B
+  - internlm/Intern-S1
+  - internlm/Intern-S1-mini
diff --git a/autotest/config_h_legacy.yml b/autotest/config_h_legacy.yml
index a9d922aad6..c9f80a3f49 100644
--- a/autotest/config_h_legacy.yml
+++ b/autotest/config_h_legacy.yml
@@ -12,28 +12,20 @@ device: cuda
 config:
   tp:
-    internlm/Intern-S1: 4
-    internlm/Intern-S1-Pro-FP8: 16
     JetLM/SDAR-30B-A3B-Sci: 2
     moonshotai/Kimi-K2-Instruct-0905: 16

   dp_ep:
-    internlm/Intern-S1-Pro-FP8:
-      dp: 16
-      ep: 16
     moonshotai/Kimi-K2-Instruct-0905:
       dp: 16
       ep: 16

 turbomind_chat_model:
   tp:
-    - internlm/Intern-S1
-    - internlm/Intern-S1-mini
+    - empty

 pytorch_chat_model:
   tp:
-    - internlm/Intern-S1
-    - internlm/Intern-S1-mini
     - JetLM/SDAR-30B-A3B-Sci
     - moonshotai/Kimi-K2-Instruct-0905

@@ -42,29 +34,27 @@ pytorch_chat_model:

 turbomind_vl_model:
   tp:
-    - internlm/Intern-S1
-    - internlm/Intern-S1-mini
+    - empty

 pytorch_vl_model:
   tp:
-    - internlm/Intern-S1
-    - internlm/Intern-S1-mini
+    - empty

 turbomind_base_model:
   tp:
+    - empty

 pytorch_base_model:
   tp:
+    - empty

 turbomind_quantization:
   no_awq:
-    - internlm/Intern-S1
-    - internlm/Intern-S1-mini
+    - empty
   gptq:
     - empty
   no_kvint4:
-    - internlm/Intern-S1
-    - internlm/Intern-S1-mini
+    - empty
   no_kvint8:
     - empty

@@ -74,21 +64,17 @@ pytorch_quantization:
   w8a8:
     - empty
   no_kvint4:
-    - internlm/Intern-S1
-    - internlm/Intern-S1-mini
     - JetLM/SDAR-30B-A3B-Sci
+    - moonshotai/Kimi-K2-Instruct-0905
   no_kvint8:
-    - empty
+    - moonshotai/Kimi-K2-Instruct-0905

 benchmark_model:
-  - internlm/Intern-S1
-  - internlm/Intern-S1-mini
+  - empty

 evaluate_model:
-  - internlm/Intern-S1-Pro-FP8
   - JetLM/SDAR-30B-A3B-Sci
   - moonshotai/Kimi-K2-Instruct-0905

 mllm_evaluate_model:
-  - internlm/Intern-S1
-  - internlm/Intern-S1-mini
+  - empty
diff --git a/autotest/evaluate/eval_config_chat.py b/autotest/evaluate/eval_config_chat.py
index 0b29046dcf..76dbfc6618 100644
--- a/autotest/evaluate/eval_config_chat.py
+++ b/autotest/evaluate/eval_config_chat.py
@@ -4,7 +4,7 @@
 from opencompass.models import OpenAISDK
 from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
 from opencompass.runners import LocalRunner
-from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
+from opencompass.tasks import OpenICLEvalTask, OpenICLInferConcurrentTask
 from opencompass.utils.text_postprocessors import extract_non_reasoning_content

 #######################################################################
@@ -48,6 +48,7 @@
          run_cfg=dict(num_gpus=0),
          meta_template=api_meta_template,
          timeout=10800,
+         max_workers=1024,
          pred_postprocessor=dict(type=extract_non_reasoning_content))
 ]

@@ -128,15 +129,13 @@
     if 'max_out_len' in item['infer_cfg']['inferencer']:
         del item['infer_cfg']['inferencer']['max_out_len']

-NUM_WORKERS = 8
-
 infer = dict(
-    partitioner=dict(type=NumWorkerPartitioner, num_worker=NUM_WORKERS),
+    partitioner=dict(type=NumWorkerPartitioner, num_worker=1),
     runner=dict(
         type=LocalRunner,
         max_num_workers=64,
         retry=0,
-        task=dict(type=OpenICLInferTask),
+        task=dict(type=OpenICLInferConcurrentTask),
     ),
 )

@@ -145,5 +144,3 @@
     partitioner=dict(type=NaivePartitioner, n=10),
     runner=dict(type=LocalRunner, max_num_workers=64, task=dict(type=OpenICLEvalTask)),
 )
-
-infer['partitioner']['num_worker'] = 64
diff --git a/autotest/evaluate/eval_config_chat_512_longtext.py b/autotest/evaluate/eval_config_chat_512_longtext.py
new file mode 100644
index 0000000000..139fd1a655
--- /dev/null
+++ b/autotest/evaluate/eval_config_chat_512_longtext.py
@@ -0,0 +1,117 @@
+# flake8: noqa
+
+from mmengine.config import read_base
+from opencompass.models import OpenAISDK
+from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
+from opencompass.runners import LocalRunner
+from opencompass.tasks import OpenICLEvalTask, OpenICLInferConcurrentTask
+from opencompass.utils.text_postprocessors import extract_non_reasoning_content
+
+#######################################################################
+#                          PART 0  Essential Configs                  #
+#######################################################################
+with read_base():
+    from opencompass.configs.datasets.ruler.ruler_512k_gen import (
+        ruler_datasets as ruler_512k_datasets,
+    )
+    from opencompass.configs.summarizers.groups.ruler import (
+        ruler_summary_groups as _ruler_summary_groups_all,
+    )
+
+ruler_summary_groups = [
+    g for g in _ruler_summary_groups_all if g.get('name') == 'ruler_512k'
+]
+
+#######################################################################
+#                       Model Configuration                           #
+#######################################################################
+
+MODEL_NAME = ''
+MODEL_PATH = ''
+API_BASE = ''
+JUDGE_MODEL_NAME = ''
+JUDGE_MODEL_PATH = ''
+JUDGE_API_BASE = ''
+
+api_meta_template = dict(round=[
+    dict(role='HUMAN', api_role='HUMAN'),
+    dict(role='BOT', api_role='BOT', generate=True),
+])
+
+# Use OpenAISDK to configure LMDeploy OpenAI interface
+models = [
+    dict(type=OpenAISDK,
+         abbr=f'{MODEL_NAME}',
+         path=MODEL_PATH,
+         key='EMPTY',
+         openai_api_base=API_BASE,
+         retry=3,
+         run_cfg=dict(num_gpus=0),
+         meta_template=api_meta_template,
+         timeout=10800,
+         max_workers=1024,
+         pred_postprocessor=dict(type=extract_non_reasoning_content))
+]
+
+#######################################################################
+#                          PART 1  Datasets List                      #
+#######################################################################
+datasets = list(ruler_512k_datasets)
+
+judge_cfg = dict(
+    type=OpenAISDK,
+    abbr=f'{JUDGE_MODEL_NAME}',
+    path=JUDGE_MODEL_NAME,
+    key='EMPTY',
+    openai_api_base=JUDGE_API_BASE,
+    meta_template=dict(round=[
+        dict(role='HUMAN', api_role='HUMAN'),
+        dict(role='BOT', api_role='BOT', generate=True),
+    ]),
+    query_per_second=16,
+    batch_size=1024,
+    temperature=0.001,
+    tokenizer_path=JUDGE_MODEL_PATH,
+    verbose=True,
+    max_out_len=8192,
+    max_seq_len=32768,
+    mode='mid',
+)
+
+for item in datasets:
+    if 'judge_cfg' in item['eval_cfg']['evaluator']:
+        item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
+    if 'llm_evaluator' in item['eval_cfg']['evaluator'].keys(
+    ) and 'judge_cfg' in item['eval_cfg']['evaluator']['llm_evaluator']:
+        item['eval_cfg']['evaluator']['llm_evaluator']['judge_cfg'] = judge_cfg
+
+#######################################################################
+#                          PART 2  Dataset Summarizer                 #
+#######################################################################
+
+summarizer = dict(
+    dataset_abbrs=[
+        ['ruler_512k', 'naive_average'],
+    ],
+    summary_groups=ruler_summary_groups,
+)
+
+for item in datasets:
+    if 'max_out_len' in item['infer_cfg']['inferencer']:
+        del item['infer_cfg']['inferencer']['max_out_len']
+
+infer = dict(
+    partitioner=dict(type=NumWorkerPartitioner, num_worker=1),
+    runner=dict(
+        type=LocalRunner,
+        max_num_workers=64,
+        retry=0,
+        task=dict(type=OpenICLInferConcurrentTask),
+    ),
+)
+
+# eval with local runner
+eval = dict(
+    partitioner=dict(type=NaivePartitioner, n=10),
+    runner=dict(type=LocalRunner, max_num_workers=64, task=dict(type=OpenICLEvalTask)),
+)
diff --git a/autotest/evaluate/eval_config_chat_longtext.py b/autotest/evaluate/eval_config_chat_longtext.py
new file mode 100644
index 0000000000..f9ba80e263
--- /dev/null
+++ b/autotest/evaluate/eval_config_chat_longtext.py
@@ -0,0 +1,184 @@
+# flake8: noqa
+
+from mmengine.config import read_base
+from opencompass.models import OpenAISDK
+from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
+from opencompass.runners import LocalRunner
+from opencompass.tasks import OpenICLEvalTask, OpenICLInferConcurrentTask
+from opencompass.utils.text_postprocessors import extract_non_reasoning_content
+
+#######################################################################
+#                          PART 0  Essential Configs                  #
+#######################################################################
+with read_base():
+    # Datasets
+    from opencompass.configs.datasets.needlebench.needlebench_8k.needlebench_8k import (
+        needlebench_datasets as needlebench_8k_datasets,
+    )
+    from opencompass.configs.datasets.needlebench.needlebench_32k.needlebench_32k import (
+        needlebench_datasets as needlebench_32k_datasets,
+    )
+    from opencompass.configs.datasets.needlebench.needlebench_128k.needlebench_128k import (
+        needlebench_datasets as needlebench_128k_datasets,
+    )
+    from opencompass.configs.datasets.ruler.ruler_8k_gen import (
+        ruler_datasets as ruler_8k_datasets,
+    )
+    from opencompass.configs.datasets.ruler.ruler_32k_gen import (
+        ruler_datasets as ruler_32k_datasets,
+    )
+    from opencompass.configs.datasets.ruler.ruler_64k_gen import (
+        ruler_datasets as ruler_64k_datasets,
+    )
+    from opencompass.configs.datasets.ruler.ruler_128k_gen import (
+        ruler_datasets as ruler_128k_datasets,
+    )
+    from opencompass.configs.datasets.ruler.ruler_256k_gen import (
+        ruler_datasets as ruler_256k_datasets,
+    )
+    from opencompass.configs.datasets.babilong.babilong_0k_gen import (
+        babiLong_0k_datasets,
+    )
+    from opencompass.configs.datasets.babilong.babilong_4k_gen import (
+        babiLong_4k_datasets,
+    )
+    from opencompass.configs.datasets.babilong.babilong_16k_gen import (
+        babiLong_16k_datasets,
+    )
+    from opencompass.configs.datasets.babilong.babilong_32k_gen import (
+        babiLong_32k_datasets,
+    )
+    from opencompass.configs.datasets.babilong.babilong_128k_gen import (
+        babiLong_128k_datasets,
+    )
+    from opencompass.configs.datasets.babilong.babilong_256k_gen import (
+        babiLong_256k_datasets,
+    )
+    # Summary Groups
+    from opencompass.configs.summarizers.groups.babilong import (
+        babilong_summary_groups,
+    )
+    from opencompass.configs.summarizers.groups.ruler import (
+        ruler_summary_groups,
+    )
+    from opencompass.configs.summarizers.needlebench import (
+        needlebench_8k_summarizer,
+        needlebench_32k_summarizer,
+        needlebench_128k_summarizer,
+    )
+
+ruler_summary_groups = [
+    g for g in ruler_summary_groups if g.get('name') != 'ruler_512k'
+]
+
+#######################################################################
+#                       Model Configuration                           #
+#######################################################################
+
+MODEL_NAME = ''
+MODEL_PATH = ''
+API_BASE = ''
+JUDGE_MODEL_NAME = ''
+JUDGE_MODEL_PATH = ''
+JUDGE_API_BASE = ''
+
+api_meta_template = dict(round=[
+    dict(role='HUMAN', api_role='HUMAN'),
+    dict(role='BOT', api_role='BOT', generate=True),
+])
+
+# Use OpenAISDK to configure LMDeploy OpenAI interface
+models = [
+    dict(type=OpenAISDK,
+         abbr=f'{MODEL_NAME}',
+         path=MODEL_PATH,
+         key='EMPTY',
+         openai_api_base=API_BASE,
+         retry=3,
+         run_cfg=dict(num_gpus=0),
+         meta_template=api_meta_template,
+         timeout=10800,
+         max_workers=1024,
+         pred_postprocessor=dict(type=extract_non_reasoning_content))
+]
+
+#######################################################################
+#                          PART 1  Datasets List                      #
+#######################################################################
+datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
+
+needlebench_8k_summary_groups = needlebench_8k_summarizer["summary_groups"]
+needlebench_32k_summary_groups = needlebench_32k_summarizer["summary_groups"]
+needlebench_128k_summary_groups = needlebench_128k_summarizer["summary_groups"]
+
+# LLM judge config: using LLM to evaluate predictions
+judge_cfg = dict(
+    type=OpenAISDK,
+    abbr=f'{JUDGE_MODEL_NAME}',
+    path=JUDGE_MODEL_NAME,
+    key='EMPTY',
+    openai_api_base=JUDGE_API_BASE,
+    meta_template=dict(round=[
+        dict(role='HUMAN', api_role='HUMAN'),
+        dict(role='BOT', api_role='BOT', generate=True),
+    ]),
+    query_per_second=16,
+    batch_size=1024,
+    temperature=0.001,
+    tokenizer_path=JUDGE_MODEL_PATH,
+    verbose=True,
+    max_out_len=8192,
+    max_seq_len=32768,
+    mode='mid',
+)
+
+for item in datasets:
+    if 'judge_cfg' in item['eval_cfg']['evaluator']:
+        item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
+    if 'llm_evaluator' in item['eval_cfg']['evaluator'].keys(
+    ) and 'judge_cfg' in item['eval_cfg']['evaluator']['llm_evaluator']:
+        item['eval_cfg']['evaluator']['llm_evaluator']['judge_cfg'] = judge_cfg
+
+#######################################################################
+#                          PART 2  Dataset Summarizer                 #
+#######################################################################
+
+summarizer = dict(
+    dataset_abbrs=[
+        ["ruler_8k", "naive_average"],
+        ["ruler_32k", "naive_average"],
+        ["ruler_64k", "naive_average"],
+        ["ruler_128k", "naive_average"],
+        ["ruler_256k", "naive_average"],
+        ["NeedleBench-Overall-Score-8K", "weighted_average"],
+        ["NeedleBench-Overall-Score-32K", "weighted_average"],
+        ["NeedleBench-Overall-Score-128K", "weighted_average"],
+        ['babilong_0k', 'naive_average'],
+        ['babilong_4k', 'naive_average'],
+        ['babilong_16k', 'naive_average'],
+        ['babilong_32k', 'naive_average'],
+        ['babilong_128k', 'naive_average'],
+        ['babilong_256k', 'naive_average'],
+    ],
+    summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
+)
+
+for item in datasets:
+    if 'max_out_len' in item['infer_cfg']['inferencer']:
+        del item['infer_cfg']['inferencer']['max_out_len']
+
+infer = dict(
+    partitioner=dict(type=NumWorkerPartitioner, num_worker=1),
+    runner=dict(
+        type=LocalRunner,
+        max_num_workers=64,
+        retry=0,
+        task=dict(type=OpenICLInferConcurrentTask),
+    ),
+)
+
+# eval with local runner
+eval = dict(
+    partitioner=dict(type=NaivePartitioner, n=10),
+    runner=dict(type=LocalRunner, max_num_workers=64, task=dict(type=OpenICLEvalTask)),
+)
diff --git a/autotest/evaluate/test_api_evaluate.py b/autotest/evaluate/test_api_evaluate.py
index 785b79f668..7877b29c04 100644
--- a/autotest/evaluate/test_api_evaluate.py
+++ b/autotest/evaluate/test_api_evaluate.py
@@ -1,3 +1,4 @@
+import copy
 import os
 import time

@@ -112,26 +113,41 @@ def _run_proxy_distributed_test(config,
     time.sleep(1)


-def run_eval_test(config, run_config, worker_id, test_type='infer', eval_config_name='default'):
+def run_eval_test(config, run_config, worker_id, test_type='infer', eval_config_name='default', eval_subpath=None):
     """Run test with specified evaluation configuration."""
-    if 'gpt' in run_config.get('model', '').lower():
-        eval_config_name = 'gpt'
-    elif 'sdar' in run_config.get('model', '').lower():
-        eval_config_name = 'sdar'
-    elif 'intern-s1-pro' in run_config.get('model', '').lower():
-        eval_config_name = 'intern-s1-pro'
-    elif 'qwen3.5' in run_config.get('model', '').lower():
-        eval_config_name = 'qwen3.5'
-    if str(config.get('env_tag')) == 'a100':
-        eval_config_name = f'{eval_config_name}-32k'
-    elif str(config.get('env_tag')) == 'ascend':
-        eval_config_name = f'{eval_config_name}-2batch'
+    if eval_config_name == 'default':
+        longtext_key = run_config.get('_longtext_eval_config_name')
+        if longtext_key:
+            eval_config_name = longtext_key
+        else:
+            if 'gpt' in run_config.get('model', '').lower():
+                eval_config_name = 'gpt'
+            elif 'sdar' in run_config.get('model', '').lower():
+                eval_config_name = 'sdar'
+            elif 'intern-s1-pro' in run_config.get('model', '').lower():
+                eval_config_name = 'intern-s1-pro'
+            elif 'qwen3.5' in run_config.get('model', '').lower():
+                eval_config_name = 'qwen3.5'
+            if str(config.get('env_tag')) == 'a100':
+                eval_config_name = f'{eval_config_name}-32k'
+            elif str(config.get('env_tag')) == 'ascend':
+                eval_config_name = f'{eval_config_name}-2batch'

     preset_config = constant.EVAL_CONFIGS.get(eval_config_name, {})
     eval_path = config.get('eval_path')
+    if eval_subpath:
+        rel = eval_subpath
+        nested = run_config.get('_eval_path_subdir')
+        if nested:
+            rel = os.path.join(rel, nested)
+        eval_path = os.path.join(eval_path, rel)
+        os.makedirs(eval_path, exist_ok=True)
+
     total_gpus = int(os.environ.get('TOTAL_GPU_COUNT', '8'))
     work_num = int(total_gpus / run_config.get('parallel_config', {}).get('tp', 1))
+
+    # Scale max-num-workers with the number of service workers, capped at 64
     extra_config = {'max-num-workers': min(work_num * 16, 64)}
+
     case_name = get_case_str_by_config(run_config)

     if test_type == 'infer':
@@ -163,6 +179,7 @@ def run_openai_service_start(i):
                              port=constant.PROXY_PORT,
                              test_type=test_type,
                              extra_config=extra_config,
+                             eval_config_name=eval_config_name,
                              **preset_config)
         finally:
             for i in range(work_num):
@@ -186,6 +203,7 @@ def run_openai_service_start(i):
                      port=port,
                      test_type=test_type,
                      extra_config=extra_config,
+                     eval_config_name=eval_config_name,
                      **preset_config)
     else:
        assert False, f'Failed to start RESTful API server: {content}'

        stop_restful_api(proxy_pid, proxy_process)


-def get_models(backend, parallel_config):
-    return get_func_config_list(backend, parallel_config, func_type='evaluate', extra={'session_len': 65536})
+def get_models(backend, parallel_config, session_len='auto'):
+    if session_len == 'auto':
+        configs = get_func_config_list(backend, parallel_config, func_type='evaluate', extra={})
+        result = []
+        for config in configs:
+            model = config.get('model', '')
+            if 'Qwen3.5' not in model:
+                if 'extra_params' not in config:
+                    config['extra_params'] = {}
+                config['extra_params']['session_len'] = 65536
+            result.append(config)
+        return result
+    else:
+        extra = {'session_len': session_len} if session_len is not None else {}
+        return get_func_config_list(backend, parallel_config, func_type='evaluate', extra=extra)
+
+
+def _resolve_longtext_eval_config_name(run_config: dict) -> str | None:
+    """Map longtext_evaluate config to EVAL_CONFIGS key; add branches when new
+    longtext families ship."""
+    ep = run_config.get('extra_params') or {}
+    raw = ep.get('session_len', ep.get('session-len'))
+    if raw is None:
+        return None
+    try:
+        sl = int(raw)
+    except (TypeError, ValueError):
+        return None
+    model_lower = (run_config.get('model') or '').lower()
+    if 'qwen3.5' in model_lower:
+        if sl >= 600000:
+            return 'longtext-512k'
+        if sl >= 300000:
+            return 'longtext-256k'
+    return None
+
+
+def get_longtext_models(backend, parallel_config, session_len='auto'):
+    if session_len == 'auto':
+        session_len = 65536
+    extra = {'session_len': session_len} if session_len is not None else {}
+    configs = get_func_config_list(backend, parallel_config, func_type='longtext_evaluate', extra=extra)
+    for cfg in configs:
+        preset_key = _resolve_longtext_eval_config_name(cfg)
+        if preset_key:
+            cfg['_longtext_eval_config_name'] = preset_key
+    return configs
+
+
+def get_mtp_models(backend, parallel_config):
+    base_configs = get_func_config_list(backend, parallel_config, func_type='mtp_evaluate', extra={})
+    for cfg in base_configs:
+        if 'qwen3.5' in cfg.get('model', '').lower():
+            cfg['extra_params'].update(constant.QWEN35_MTP_SERVER_EXTRA)
+
+    result_configs = []
+    for config in base_configs:
+        result_configs.append(config)
+
+        if config.get('model') == 'Qwen/Qwen3.5-35B-A3B' and parallel_config.get('tp') == 2:
+            fp8_config = copy.deepcopy(config)
+            fp8_config['extra_params']['max-prefill-token-num'] = 1024
+            fp8_config['extra_params']['model-format'] = 'fp8'
+            fp8_config['_eval_path_subdir'] = 'serve_fp8'
+            result_configs.append(fp8_config)
+
+    return result_configs


 @pytest.mark.infer
 @pytest.mark.pytorch
 @pytest.mark.gpu_num_2
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.parametrize('run_config', get_models('pytorch', {'tp': 2}))
run_config, worker_id): run_eval_test(config, run_config, worker_id, 'infer') +@pytest.mark.infer +@pytest.mark.pytorch +@pytest.mark.gpu_num_2 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('run_config', get_longtext_models('pytorch', {'tp': 2}, session_len=400000)) +def test_pytorch_restful_tp2_longtext(config, run_config, worker_id): + run_eval_test(config, run_config, worker_id, 'infer') + + +@pytest.mark.infer +@pytest.mark.pytorch +@pytest.mark.gpu_num_2 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('run_config', get_longtext_models('pytorch', {'tp': 2}, session_len=700000)) +def test_pytorch_restful_tp2_longtext_512k(config, run_config, worker_id): + run_eval_test(config, run_config, worker_id, 'infer') + + +@pytest.mark.infer +@pytest.mark.pytorch +@pytest.mark.gpu_num_2 +@pytest.mark.mtp +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('run_config', get_mtp_models('pytorch', {'tp': 2})) +def test_pytorch_restful_tp2_mtp(config, run_config, worker_id): + run_eval_test(config, run_config, worker_id, 'infer', eval_subpath='mtp') + + +@pytest.mark.infer +@pytest.mark.pytorch +@pytest.mark.gpu_num_1 +@pytest.mark.mtp +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('run_config', get_mtp_models('pytorch', {'tp': 1})) +def test_pytorch_restful_tp1_mtp(config, run_config, worker_id): + run_eval_test(config, run_config, worker_id, 'infer', eval_subpath='mtp') + + +@pytest.mark.eval +@pytest.mark.pytorch +@pytest.mark.gpu_num_2 +@pytest.mark.mtp +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('run_config', get_mtp_models('pytorch', {'tp': 2})) +def test_pytorch_eval_tp2_mtp(config, run_config, worker_id): + run_eval_test(config, run_config, worker_id, 'eval', eval_subpath='mtp') + + +@pytest.mark.eval +@pytest.mark.pytorch +@pytest.mark.gpu_num_1 +@pytest.mark.mtp +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('run_config', get_mtp_models('pytorch', {'tp': 1})) +def test_pytorch_eval_tp1_mtp(config, run_config, worker_id): + run_eval_test(config, run_config, worker_id, 'eval', eval_subpath='mtp') + + @pytest.mark.infer @pytest.mark.pytorch @pytest.mark.gpu_num_4 @@ -446,6 +587,24 @@ def test_pytorch_eval_distributed_dpep16(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval') +@pytest.mark.eval +@pytest.mark.pytorch +@pytest.mark.gpu_num_2 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('run_config', get_longtext_models('pytorch', {'tp': 2}, session_len=400000)) +def test_pytorch_eval_tp2_longtext(config, run_config, worker_id): + run_eval_test(config, run_config, worker_id, 'eval') + + +@pytest.mark.eval +@pytest.mark.pytorch +@pytest.mark.gpu_num_2 +@pytest.mark.flaky(reruns=0) +@pytest.mark.parametrize('run_config', get_longtext_models('pytorch', {'tp': 2}, session_len=700000)) +def test_pytorch_eval_tp2_longtext_512k(config, run_config, worker_id): + run_eval_test(config, run_config, worker_id, 'eval') + + @pytest.mark.eval @pytest.mark.turbomind @pytest.mark.gpu_num_distributed_cp2tp8 diff --git a/autotest/tools/common_case_config.py b/autotest/tools/common_case_config.py index 3eac291a8f..f32bbc82a0 100644 --- a/autotest/tools/common_case_config.py +++ b/autotest/tools/common_case_config.py @@ -363,6 +363,53 @@ 'num_speculative_tokens': 3 } } +}, { + 'model': 'Qwen/Qwen3.5-35B-A3B', + 'communicator': 'nccl', + 'quant_policy': 0, + 'parallel_config': { + 'tp': 2 + }, + 'extra_params': { + 'max_batch_size': 256, + 'reasoning_parser': 'qwen-qwq', + 'speculative_config': { + 'method': 'qwen3_5_mtp', + 
'num_speculative_tokens': 4 + } + } +}, { + 'model': 'Qwen/Qwen3.5-35B-A3B', + 'communicator': 'nccl', + 'quant_policy': 0, + 'parallel_config': { + 'tp': 2 + }, + 'extra_params': { + 'max_batch_size': 256, + 'reasoning_parser': 'qwen-qwq', + 'max_prefill_token_num': 1024, + 'model_format': 'fp8', + 'speculative_config': { + 'method': 'qwen3_5_mtp', + 'num_speculative_tokens': 4 + } + } +}, { + 'model': 'Qwen/Qwen3.5-35B-A3B-FP8', + 'communicator': 'nccl', + 'quant_policy': 0, + 'parallel_config': { + 'tp': 1 + }, + 'extra_params': { + 'max_batch_size': 256, + 'reasoning_parser': 'qwen-qwq', + 'speculative_config': { + 'method': 'qwen3_5_mtp', + 'num_speculative_tokens': 4 + } + } }] SPECULATIVE_DECODING_PIPELINE_TEST_LLM = [{ @@ -383,7 +430,7 @@ 'max-batch-size': 128 } }, { - 'model': 'deepseek/DeepSeek-V3', + 'model': 'deepseek-ai/DeepSeek-V3', 'communicator': 'nccl', 'quant_policy': 0, 'parallel_config': { @@ -406,6 +453,47 @@ 'speculative-num-draft-tokens': 3, 'max-batch-size': 128 } +}, { + 'model': 'Qwen/Qwen3.5-35B-A3B', + 'communicator': 'nccl', + 'quant_policy': 0, + 'parallel_config': { + 'tp': 2 + }, + 'extra_params': { + 'reasoning-parser': 'qwen-qwq', + 'speculative-algorithm': 'qwen3_5_mtp', + 'speculative-num-draft-tokens': 4, + 'max-batch-size': 256 + } +}, { + 'model': 'Qwen/Qwen3.5-35B-A3B', + 'communicator': 'nccl', + 'quant_policy': 0, + 'parallel_config': { + 'tp': 2 + }, + 'extra_params': { + 'reasoning-parser': 'qwen-qwq', + 'speculative-algorithm': 'qwen3_5_mtp', + 'speculative-num-draft-tokens': 4, + 'max-batch-size': 256, + 'max-prefill-token-num': 1024, + 'model-format': 'fp8' + } +}, { + 'model': 'Qwen/Qwen3.5-35B-A3B-FP8', + 'communicator': 'nccl', + 'quant_policy': 0, + 'parallel_config': { + 'tp': 1 + }, + 'extra_params': { + 'reasoning-parser': 'qwen-qwq', + 'speculative-algorithm': 'qwen3_5_mtp', + 'speculative-num-draft-tokens': 4, + 'max-batch-size': 256 + } }] SPECULATIVE_DECODING_RESTFUL_TEST_LLM = [{ diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index e1ea067cc3..82883d189d 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -101,10 +101,14 @@ def get_func_config_list(backend: str, if 'Qwen3-235B-A22B-Thinking-2507' in run_config['model']: run_config['extra_params']['cache-max-entry-count'] = 0.9 run_config['extra_params']['max-batch-size'] = 1024 + para_conf = run_config.get('parallel_config', {}) + if para_conf.get('dp', 0) == 8 and para_conf.get('ep', 0) == 8: + run_config['extra_params']['max-batch-size'] = 256 - if 'GLM-5' in run_config['model']: + if 'GLM-5-FP8' in run_config['model']: run_config['extra_params']['cache-max-entry-count'] = 0.9 run_config['extra_params']['max-batch-size'] = 128 + run_config['extra_params']['model-format'] = 'fp8' if 'Qwen3.5' in run_config['model']: run_config['extra_params']['session-len'] = 128000 diff --git a/autotest/utils/constant.py b/autotest/utils/constant.py index 28d5933963..1ac6d47d1d 100644 --- a/autotest/utils/constant.py +++ b/autotest/utils/constant.py @@ -141,16 +141,40 @@ 'query_per_second': 4, 'max_out_len': 128000, 'max_seq_len': 128000, - 'batch_size': 500, + 'batch_size': 32, 'temperature': 1.0, - 'openai_extra_kwargs': { + 'extra_body': { + 'top_k': 20, + 'repetition_penalty': 1.0, 'top_p': 0.95, - 'presence_penalty': 1.5, + 'chat_template_kwargs': {'enable_thinking': True}, }, + }, + 'longtext-256k': { + 'query_per_second': 4, + 'max_out_len': 280000, + 'max_seq_len': 400000, + 'batch_size': 32, + 'temperature': 1.0, 'extra_body': 
{ 'top_k': 20, - 'min_p': 0.0, - } + 'repetition_penalty': 1.0, + 'top_p': 0.95, + 'chat_template_kwargs': {'enable_thinking': True}, + }, + }, + 'longtext-512k': { + 'query_per_second': 4, + 'max_out_len': 700000, + 'max_seq_len': 700000, + 'batch_size': 32, + 'temperature': 1.0, + 'extra_body': { + 'top_k': 20, + 'repetition_penalty': 1.0, + 'top_p': 0.95, + 'chat_template_kwargs': {'enable_thinking': True}, + }, }, 'qwen3.5-2batch': { 'query_per_second': 4, @@ -158,14 +182,12 @@ 'max_seq_len': 128000, 'batch_size': 2, 'temperature': 1.0, - 'openai_extra_kwargs': { - 'top_p': 0.95, - 'presence_penalty': 1.5, - }, 'extra_body': { 'top_k': 20, - 'min_p': 0.0, - } + 'repetition_penalty': 1.0, + 'top_p': 0.95, + 'chat_template_kwargs': {'enable_thinking': True}, + }, }, } @@ -181,11 +203,18 @@ BACKEND_LIST = ['turbomind', 'pytorch'] +QWEN35_MTP_SERVER_EXTRA = { + 'reasoning-parser': 'qwen-qwq', + 'speculative-algorithm': 'qwen3_5_mtp', + 'speculative-num-draft-tokens': 4, + 'max-batch-size': 256, +} + RESTFUL_MODEL_LIST = [ 'Qwen/Qwen3-0.6B', 'Qwen/Qwen3-VL-2B-Instruct', 'Qwen/Qwen3-30B-A3B', 'internlm/Intern-S1', 'internlm/internlm2_5-20b', 'Qwen/Qwen3-32B', 'OpenGVLab/InternVL3_5-30B-A3B', 'OpenGVLab/InternVL3-38B', 'Qwen/Qwen3-VL-8B-Instruct', 'internlm/internlm3-8b-instruct', 'meta-llama/Llama-3.2-3B-Instruct', - 'Qwen/Qwen3-VL-30B-A3B-Instruct' + 'Qwen/Qwen3-VL-30B-A3B-Instruct', 'Qwen/Qwen3.5-35B-A3B', 'Qwen/Qwen3.5-35B-A3B-FP8', 'Qwen/Qwen3.5-122B-A10B' ] RESTFUL_BASE_MODEL_LIST = [ diff --git a/autotest/utils/evaluate_utils.py b/autotest/utils/evaluate_utils.py index 8535e805bf..07025b9b56 100644 --- a/autotest/utils/evaluate_utils.py +++ b/autotest/utils/evaluate_utils.py @@ -143,7 +143,16 @@ def mllm_summary(case_name, write_to_summary(case_name, result, msg, metrics, result_dir) -def eval_test(model_path, eval_path, case_name, port=DEFAULT_PORT, test_type='infer', extra_config={}, **kwargs): +def eval_test(model_path, + eval_path, + case_name, + port=DEFAULT_PORT, + test_type='infer', + extra_config=None, + eval_config_name='default', + **kwargs): + if extra_config is None: + extra_config = {} work_dir = None try: @@ -154,7 +163,13 @@ def eval_test(model_path, eval_path, case_name, port=DEFAULT_PORT, test_type='in current_dir = os.path.dirname(os.path.abspath(__file__)) parent_dir = os.path.dirname(current_dir) - config_file = os.path.join(parent_dir, 'evaluate/eval_config_chat.py') + + if eval_config_name == 'longtext-512k': + config_file = os.path.join(parent_dir, 'evaluate/eval_config_chat_512_longtext.py') + elif eval_config_name == 'longtext-256k': + config_file = os.path.join(parent_dir, 'evaluate/eval_config_chat_longtext.py') + else: + config_file = os.path.join(parent_dir, 'evaluate/eval_config_chat.py') print(f'Starting OpenCompass evaluation for model: {model_path}') print(f'Model path: {model_path}') From 686cdc5205792478571cdde839156476c0744139 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Mon, 13 Apr 2026 16:53:58 +0800 Subject: [PATCH 09/10] fix lint --- .../evaluate/eval_config_chat_longtext.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/autotest/evaluate/eval_config_chat_longtext.py b/autotest/evaluate/eval_config_chat_longtext.py index f9ba80e263..bfd3176406 100644 --- a/autotest/evaluate/eval_config_chat_longtext.py +++ b/autotest/evaluate/eval_config_chat_longtext.py @@ -107,9 +107,9 @@ ####################################################################### datasets = sum((v for k, v in 
locals().items() if k.endswith('_datasets')), []) -needlebench_8k_summary_groups = needlebench_8k_summarizer["summary_groups"] -needlebench_32k_summary_groups = needlebench_32k_summarizer["summary_groups"] -needlebench_128k_summary_groups = needlebench_128k_summarizer["summary_groups"] +needlebench_8k_summary_groups = needlebench_8k_summarizer['summary_groups'] +needlebench_32k_summary_groups = needlebench_32k_summarizer['summary_groups'] +needlebench_128k_summary_groups = needlebench_128k_summarizer['summary_groups'] # LLM judge config: using LLM to evaluate predictions judge_cfg = dict( @@ -145,14 +145,14 @@ summarizer = dict( dataset_abbrs=[ - ["ruler_8k", "naive_average"], - ["ruler_32k", "naive_average"], - ["ruler_64k", "naive_average"], - ["ruler_128k", "naive_average"], - ["ruler_256k", "naive_average"], - ["NeedleBench-Overall-Score-8K", "weighted_average"], - ["NeedleBench-Overall-Score-32K", "weighted_average"], - ["NeedleBench-Overall-Score-128K", "weighted_average"], + ['ruler_8k', 'naive_average'], + ['ruler_32k', 'naive_average'], + ['ruler_64k', 'naive_average'], + ['ruler_128k', 'naive_average'], + ['ruler_256k', 'naive_average'], + ['NeedleBench-Overall-Score-8K', 'weighted_average'], + ['NeedleBench-Overall-Score-32K', 'weighted_average'], + ['NeedleBench-Overall-Score-128K', 'weighted_average'], ['babilong_0k', 'naive_average'], ['babilong_4k', 'naive_average'], ['babilong_16k', 'naive_average'], From 45bcb87df25af5948ef147b8e4f750c768072535 Mon Sep 17 00:00:00 2001 From: littlegy <787321726@qq.com> Date: Mon, 13 Apr 2026 18:50:42 +0800 Subject: [PATCH 10/10] update evaluate test --- autotest/config_h.yml | 102 ++++++------ autotest/evaluate/test_api_evaluate.py | 212 ++++++++++--------------- autotest/utils/config_utils.py | 15 +- autotest/utils/constant.py | 7 - 4 files changed, 146 insertions(+), 190 deletions(-) diff --git a/autotest/config_h.yml b/autotest/config_h.yml index 675f1f6f4b..bbffee84fc 100644 --- a/autotest/config_h.yml +++ b/autotest/config_h.yml @@ -12,31 +12,31 @@ device: cuda config: tp: - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8: 4 Qwen/Qwen3-30B-A3B: 2 - openai/gpt-oss-120b: 2 - openai/gpt-oss-20b: 2 - unsloth/gpt-oss-20b-BF16: 2 - deepseek-ai/DeepSeek-V3.1: 8 - Qwen/Qwen3-30B-A3B-Base: 2 Qwen/Qwen3-235B-A22B-Thinking-2507: 8 - OpenGVLab/InternVL3_5-38B: 2 + Qwen/Qwen3-235B-A22B-Thinking-2507-FP8: 4 Qwen/Qwen3-VL-30B-A3B-Instruct: 2 - zai-org/GLM-5-FP8: 8 + Qwen/Qwen2.5-VL-32B-Instruct: 2 Qwen/Qwen3.5-27B: 2 Qwen/Qwen3.5-35B-A3B: 2 Qwen/Qwen3.5-122B-A10B: 4 + Qwen/Qwen3-30B-A3B-Base: 2 meta-llama/Llama-4-Scout-17B-16E-Instruct: 4 meta-llama/Meta-Llama-3.1-70B-Instruct: 4 + internlm/Intern-S1: 4 + internlm/Intern-S1-Pro-FP8: 16 OpenGVLab/InternVL3-38B: 2 - Qwen/Qwen2.5-VL-32B-Instruct: 2 + OpenGVLab/InternVL3_5-30B-A3B: 2 + OpenGVLab/InternVL3_5-38B: 2 + deepseek-ai/DeepSeek-V3.1: 8 deepseek-ai/DeepSeek-V2-Lite-Chat: 2 mistralai/Mixtral-8x7B-Instruct-v0.1: 2 - OpenGVLab/InternVL3_5-30B-A3B: 2 zai-org/GLM-4.7-Flash: 2 + zai-org/GLM-5-FP8: 8 google/gemma-3-27b-it: 2 - internlm/Intern-S1: 4 - internlm/Intern-S1-Pro-FP8: 16 + openai/gpt-oss-120b: 2 + openai/gpt-oss-20b: 2 + unsloth/gpt-oss-20b-BF16: 2 dp_ep: Qwen/Qwen3-235B-A22B-Thinking-2507: @@ -61,9 +61,12 @@ turbomind_chat_model: - meta-llama/Meta-Llama-3-8B-Instruct - internlm/internlm3-8b-instruct - internlm/internlm3-8b-instruct-awq + - internlm/Intern-S1 + - internlm/Intern-S1-mini - OpenGVLab/InternVL3-8B - OpenGVLab/InternVL3-38B - OpenGVLab/InternVL3_5-30B-A3B + - OpenGVLab/InternVL3_5-38B - 
Qwen/Qwen3-0.6B - Qwen/Qwen3-4B - Qwen/Qwen3-8B @@ -76,13 +79,10 @@ turbomind_chat_model: - Qwen/Qwen2.5-VL-32B-Instruct - Qwen/Qwen1.5-MoE-A2.7B-Chat - mistralai/Mixtral-8x7B-Instruct-v0.1 - - OpenGVLab/InternVL3_5-38B - deepseek-ai/DeepSeek-V2-Lite-Chat - THUDM/glm-4-9b-chat - openai/gpt-oss-120b - openai/gpt-oss-20b - - internlm/Intern-S1 - - internlm/Intern-S1-mini cp_tp: - Qwen/Qwen3-235B-A22B-Thinking-2507 @@ -97,9 +97,13 @@ pytorch_chat_model: - meta-llama/Meta-Llama-3.1-70B-Instruct - meta-llama/Meta-Llama-3-8B-Instruct - internlm/internlm3-8b-instruct + - internlm/Intern-S1 + - internlm/Intern-S1-mini + - internlm/Intern-S1-Pro-FP8 - OpenGVLab/InternVL3-8B - OpenGVLab/InternVL3-38B - OpenGVLab/InternVL3_5-30B-A3B + - OpenGVLab/InternVL3_5-38B - Qwen/Qwen3-0.6B - Qwen/Qwen3-4B - Qwen/Qwen3-8B @@ -110,25 +114,21 @@ pytorch_chat_model: - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8 - Qwen/Qwen3-VL-8B-Instruct - Qwen/Qwen3-VL-30B-A3B-Instruct + - Qwen/Qwen3.5-27B + - Qwen/Qwen3.5-35B-A3B + - Qwen/Qwen3.5-35B-A3B-FP8 + - Qwen/Qwen3.5-122B-A10B - THUDM/cogvlm-chat-hf - THUDM/cogvlm2-llama3-chinese-chat-19B - THUDM/glm-4v-9b - THUDM/glm-4-9b-chat - - zai-org/GLM-4.7-Flash - microsoft/Phi-3.5-vision-instruct - microsoft/Phi-3-vision-128k-instruct + - zai-org/GLM-4.7-Flash - zai-org/GLM-5-FP8 - - Qwen/Qwen3.5-27B - - Qwen/Qwen3.5-35B-A3B - - Qwen/Qwen3.5-35B-A3B-FP8 - - Qwen/Qwen3.5-122B-A10B - deepseek-ai/DeepSeek-V3.1 - - unsloth/gpt-oss-20b-BF16 - google/gemma-3-27b-it - - OpenGVLab/InternVL3_5-38B - - internlm/Intern-S1 - - internlm/Intern-S1-mini - - internlm/Intern-S1-Pro-FP8 + - unsloth/gpt-oss-20b-BF16 dp_ep: - Qwen/Qwen3-235B-A22B-Thinking-2507 @@ -139,9 +139,9 @@ turbomind_vl_model: - OpenGVLab/InternVL3-8B - OpenGVLab/InternVL3-38B - OpenGVLab/InternVL3_5-30B-A3B + - OpenGVLab/InternVL3_5-38B - Qwen/Qwen2.5-VL-7B-Instruct - Qwen/Qwen2.5-VL-32B-Instruct - - OpenGVLab/InternVL3_5-38B - internlm/Intern-S1 - internlm/Intern-S1-mini @@ -151,15 +151,15 @@ pytorch_vl_model: - OpenGVLab/InternVL3_5-30B-A3B - Qwen/Qwen3-VL-8B-Instruct - Qwen/Qwen3-VL-30B-A3B-Instruct + - Qwen/Qwen3.5-27B + - Qwen/Qwen3.5-35B-A3B + - Qwen/Qwen3.5-35B-A3B-FP8 + - Qwen/Qwen3.5-122B-A10B - THUDM/cogvlm-chat-hf - THUDM/cogvlm2-llama3-chinese-chat-19B - THUDM/glm-4v-9b - microsoft/Phi-3-vision-128k-instruct - microsoft/Phi-3.5-vision-instruct - - Qwen/Qwen3.5-27B - - Qwen/Qwen3.5-35B-A3B - - Qwen/Qwen3.5-35B-A3B-FP8 - - Qwen/Qwen3.5-122B-A10B - internlm/Intern-S1 - internlm/Intern-S1-mini @@ -178,6 +178,9 @@ turbomind_quantization: no_awq: - meta-llama/Meta-Llama-3.1-70B-Instruct - internlm/internlm3-8b-instruct + - internlm/Intern-S1 + - internlm/Intern-S1-mini + - OpenGVLab/InternVL3_5-30B-A3B - Qwen/Qwen3-30B-A3B - Qwen/Qwen3-235B-A22B-Thinking-2507 - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8 @@ -188,15 +191,12 @@ turbomind_quantization: - Qwen/Qwen1.5-MoE-A2.7B-Chat - Qwen/Qwen2.5-VL-7B-Instruct - Qwen/Qwen2.5-VL-32B-Instruct - - OpenGVLab/InternVL3_5-30B-A3B - deepseek-ai/DeepSeek-V2-Lite-Chat - - openai/gpt-oss-120b - - openai/gpt-oss-20b - - microsoft/Phi-3-mini-4k-instruct - THUDM/glm-4v-9b - THUDM/glm-4-9b-chat - - internlm/Intern-S1 - - internlm/Intern-S1-mini + - microsoft/Phi-3-mini-4k-instruct + - openai/gpt-oss-120b + - openai/gpt-oss-20b gptq: - empty @@ -219,10 +219,10 @@ turbomind_quantization: - Qwen/Qwen3-8B-Base - Qwen/Qwen3-30B-A3B-Base - deepseek-ai/DeepSeek-V2-Lite-Chat - - openai/gpt-oss-120b - - openai/gpt-oss-20b - internlm/Intern-S1 - internlm/Intern-S1-mini + - openai/gpt-oss-120b + - 
openai/gpt-oss-20b no_kvint8: - deepseek-ai/DeepSeek-V2-Chat - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8 @@ -246,8 +246,6 @@ pytorch_quantization: - meta-llama/Llama-3.2-1B-Instruct - OpenGVLab/InternVL3-2B - OpenGVLab/InternVL3-8B - - Qwen/Qwen3-8B-Base - - Qwen/Qwen3-30B-A3B-Base - Qwen/Qwen3-0.6B - Qwen/Qwen3-4B - Qwen/Qwen3-8B @@ -259,13 +257,15 @@ pytorch_quantization: - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8 - Qwen/Qwen3-VL-8B-Instruct - Qwen/Qwen3-VL-30B-A3B-Instruct + - Qwen/Qwen3-8B-Base + - Qwen/Qwen3-30B-A3B-Base + - Qwen/Qwen3.5-27B + - Qwen/Qwen3.5-35B-A3B + - Qwen/Qwen3.5-122B-A10B - microsoft/Phi-3-vision-128k-instruct - microsoft/Phi-3.5-vision-instruct - zai-org/GLM-4.7-Flash - zai-org/GLM-5-FP8 - - Qwen/Qwen3.5-27B - - Qwen/Qwen3.5-35B-A3B - - Qwen/Qwen3.5-122B-A10B - deepseek-ai/DeepSeek-V3.1 - internlm/Intern-S1 - internlm/Intern-S1-mini @@ -284,11 +284,11 @@ pytorch_quantization: longtext_benchmark_model: - Qwen/Qwen3-30B-A3B - Qwen/Qwen3-235B-A22B-Thinking-2507 - - zai-org/GLM-5-FP8 - Qwen/Qwen3.5-27B - Qwen/Qwen3.5-35B-A3B - Qwen/Qwen3.5-35B-A3B-FP8 - Qwen/Qwen3.5-122B-A10B + - zai-org/GLM-5-FP8 benchmark_model: - meta-llama/Meta-Llama-3.1-8B-Instruct @@ -296,14 +296,14 @@ benchmark_model: - Qwen/Qwen3-30B-A3B - Qwen/Qwen3-235B-A22B-Thinking-2507 - Qwen/Qwen2.5-72B-Instruct - - openai/gpt-oss-120b - - openai/gpt-oss-20b - - unsloth/gpt-oss-20b-BF16 - - zai-org/GLM-5-FP8 - Qwen/Qwen3.5-27B - Qwen/Qwen3.5-35B-A3B - Qwen/Qwen3.5-35B-A3B-FP8 - Qwen/Qwen3.5-122B-A10B + - openai/gpt-oss-120b + - openai/gpt-oss-20b + - unsloth/gpt-oss-20b-BF16 + - zai-org/GLM-5-FP8 - google/gemma-3-27b-it - internlm/Intern-S1 - internlm/Intern-S1-mini @@ -314,13 +314,13 @@ evaluate_model: - Qwen/Qwen3-30B-A3B-FP8 - Qwen/Qwen3-235B-A22B-Thinking-2507 - Qwen/Qwen3-235B-A22B-Thinking-2507-FP8 - - openai/gpt-oss-120b - - deepseek-ai/DeepSeek-V3.1 - - zai-org/GLM-5-FP8 - Qwen/Qwen3.5-27B - Qwen/Qwen3.5-35B-A3B - Qwen/Qwen3.5-35B-A3B-FP8 - Qwen/Qwen3.5-122B-A10B + - openai/gpt-oss-120b + - deepseek-ai/DeepSeek-V3.1 + - zai-org/GLM-5-FP8 - internlm/Intern-S1-Pro-FP8 longtext_evaluate_model: diff --git a/autotest/evaluate/test_api_evaluate.py b/autotest/evaluate/test_api_evaluate.py index 7877b29c04..3f72825ee0 100644 --- a/autotest/evaluate/test_api_evaluate.py +++ b/autotest/evaluate/test_api_evaluate.py @@ -1,4 +1,3 @@ -import copy import os import time @@ -116,30 +115,22 @@ def _run_proxy_distributed_test(config, def run_eval_test(config, run_config, worker_id, test_type='infer', eval_config_name='default', eval_subpath=None): """Run test with specified evaluation configuration.""" if eval_config_name == 'default': - longtext_key = run_config.get('_longtext_eval_config_name') - if longtext_key: - eval_config_name = longtext_key - else: - if 'gpt' in run_config.get('model', '').lower(): - eval_config_name = 'gpt' - elif 'sdar' in run_config.get('model', '').lower(): - eval_config_name = 'sdar' - elif 'intern-s1-pro' in run_config.get('model', '').lower(): - eval_config_name = 'intern-s1-pro' - elif 'qwen3.5' in run_config.get('model', '').lower(): - eval_config_name = 'qwen3.5' - if str(config.get('env_tag')) == 'a100': - eval_config_name = f'{eval_config_name}-32k' - elif str(config.get('env_tag')) == 'ascend': - eval_config_name = f'{eval_config_name}-2batch' + if 'gpt' in run_config.get('model', '').lower(): + eval_config_name = 'gpt' + elif 'sdar' in run_config.get('model', '').lower(): + eval_config_name = 'sdar' + elif 'intern-s1-pro' in run_config.get('model', '').lower(): + eval_config_name 
= 'intern-s1-pro' + elif 'qwen3.5' in run_config.get('model', '').lower(): + eval_config_name = 'qwen3.5' + if str(config.get('env_tag')) == 'a100': + eval_config_name = f'{eval_config_name}-32k' + elif str(config.get('env_tag')) == 'ascend': + eval_config_name = f'{eval_config_name}-2batch' preset_config = constant.EVAL_CONFIGS.get(eval_config_name, {}) eval_path = config.get('eval_path') if eval_subpath: - rel = eval_subpath - nested = run_config.get('_eval_path_subdir') - if nested: - rel = os.path.join(rel, nested) - eval_path = os.path.join(eval_path, rel) + eval_path = os.path.join(eval_path, eval_subpath) os.makedirs(eval_path, exist_ok=True) total_gpus = int(os.environ.get('TOTAL_GPU_COUNT', '8')) @@ -213,80 +204,11 @@ def run_openai_service_start(i): stop_restful_api(proxy_pid, proxy_process) -def get_models(backend, parallel_config, session_len='auto'): - if session_len == 'auto': - configs = get_func_config_list(backend, parallel_config, func_type='evaluate', extra={}) - result = [] - for config in configs: - model = config.get('model', '') - if 'Qwen3.5' not in model: - if 'extra_params' not in config: - config['extra_params'] = {} - config['extra_params']['session_len'] = 65536 - result.append(config) - return result - else: - extra = {'session_len': session_len} if session_len is not None else {} - return get_func_config_list(backend, parallel_config, func_type='evaluate', extra=extra) - - -def _resolve_longtext_eval_config_name(run_config: dict) -> str | None: - """Map longtext_evaluate config to EVAL_CONFIGS key; add branches when new - longtext families ship.""" - ep = run_config.get('extra_params') or {} - raw = ep.get('session_len', ep.get('session-len')) - if raw is None: - return None - try: - sl = int(raw) - except (TypeError, ValueError): - return None - model_lower = (run_config.get('model') or '').lower() - if 'qwen3.5' in model_lower: - if sl >= 600000: - return 'longtext-512k' - if sl >= 300000: - return 'longtext-256k' - return None - - -def get_longtext_models(backend, parallel_config, session_len='auto'): - if session_len == 'auto': - session_len = 65536 - extra = {'session_len': session_len} if session_len is not None else {} - configs = get_func_config_list(backend, parallel_config, func_type='longtext_evaluate', extra=extra) - for cfg in configs: - preset_key = _resolve_longtext_eval_config_name(cfg) - if preset_key: - cfg['_longtext_eval_config_name'] = preset_key - return configs - - -def get_mtp_models(backend, parallel_config): - base_configs = get_func_config_list(backend, parallel_config, func_type='mtp_evaluate', extra={}) - for cfg in base_configs: - if 'qwen3.5' in cfg.get('model', '').lower(): - cfg['extra_params'].update(constant.QWEN35_MTP_SERVER_EXTRA) - - result_configs = [] - for config in base_configs: - result_configs.append(config) - - if config.get('model') == 'Qwen/Qwen3.5-35B-A3B' and parallel_config.get('tp') == 2: - fp8_config = copy.deepcopy(config) - fp8_config['extra_params']['max-prefill-token-num'] = 1024 - fp8_config['extra_params']['model-format'] = 'fp8' - fp8_config['_eval_path_subdir'] = 'serve_fp8' - result_configs.append(fp8_config) - - return result_configs - - @pytest.mark.infer @pytest.mark.turbomind @pytest.mark.gpu_num_1 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('turbomind', {'tp': 1})) +@pytest.mark.parametrize('run_config', get_func_config_list('turbomind', {'tp': 1}, func_type='evaluate')) def test_turbomind_infer_tp1(config, run_config, worker_id): run_eval_test(config, 
run_config, worker_id, 'infer') @@ -295,7 +217,7 @@ def test_turbomind_infer_tp1(config, run_config, worker_id): @pytest.mark.turbomind @pytest.mark.gpu_num_2 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('turbomind', {'tp': 2})) +@pytest.mark.parametrize('run_config', get_func_config_list('turbomind', {'tp': 2}, func_type='evaluate')) def test_turbomind_infer_tp2(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'infer') @@ -304,7 +226,7 @@ def test_turbomind_infer_tp2(config, run_config, worker_id): @pytest.mark.turbomind @pytest.mark.gpu_num_4 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('turbomind', {'tp': 4})) +@pytest.mark.parametrize('run_config', get_func_config_list('turbomind', {'tp': 4}, func_type='evaluate')) def test_turbomind_infer_tp4(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'infer') @@ -313,7 +235,7 @@ def test_turbomind_infer_tp4(config, run_config, worker_id): @pytest.mark.turbomind @pytest.mark.gpu_num_8 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('turbomind', {'tp': 8})) +@pytest.mark.parametrize('run_config', get_func_config_list('turbomind', {'tp': 8}, func_type='evaluate')) def test_turbomind_infer_tp8(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'infer') @@ -322,7 +244,7 @@ def test_turbomind_infer_tp8(config, run_config, worker_id): @pytest.mark.turbomind @pytest.mark.gpu_num_distributed_cp2tp8 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('turbomind', {'cp': 2, 'tp': 8})) +@pytest.mark.parametrize('run_config', get_func_config_list('turbomind', {'cp': 2, 'tp': 8}, func_type='evaluate')) def test_turbomind_infer_cp2tp8(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'infer') @@ -332,7 +254,7 @@ def test_turbomind_infer_cp2tp8(config, run_config, worker_id): @pytest.mark.gpu_num_1 @pytest.mark.test_ascend @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('pytorch', {'tp': 1})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 1}, func_type='evaluate')) def test_pytorch_restful_tp1(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'infer') @@ -342,7 +264,7 @@ def test_pytorch_restful_tp1(config, run_config, worker_id): @pytest.mark.gpu_num_2 @pytest.mark.test_ascend @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('pytorch', {'tp': 2})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 2}, func_type='evaluate')) def test_pytorch_restful_tp2(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'infer') @@ -351,18 +273,34 @@ def test_pytorch_restful_tp2(config, run_config, worker_id): @pytest.mark.pytorch @pytest.mark.gpu_num_2 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_longtext_models('pytorch', {'tp': 2}, session_len=400000)) +@pytest.mark.parametrize( + 'run_config', + get_func_config_list( + 'pytorch', + {'tp': 2}, + func_type='longtext_evaluate', + extra={'session_len': 400000}, + ), +) def test_pytorch_restful_tp2_longtext(config, run_config, worker_id): - run_eval_test(config, run_config, worker_id, 'infer') + run_eval_test(config, run_config, worker_id, 'infer', eval_config_name='longtext-256k') @pytest.mark.infer @pytest.mark.pytorch @pytest.mark.gpu_num_2 @pytest.mark.flaky(reruns=0) 
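+# session_len=700000 matches the 'longtext-512k' preset's max_seq_len in constant.EVAL_CONFIGS.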
-@pytest.mark.parametrize('run_config', get_longtext_models('pytorch', {'tp': 2}, session_len=700000)) +@pytest.mark.parametrize( + 'run_config', + get_func_config_list( + 'pytorch', + {'tp': 2}, + func_type='longtext_evaluate', + extra={'session_len': 700000}, + ), +) def test_pytorch_restful_tp2_longtext_512k(config, run_config, worker_id): - run_eval_test(config, run_config, worker_id, 'infer') + run_eval_test(config, run_config, worker_id, 'infer', eval_config_name='longtext-512k') @pytest.mark.infer @@ -370,7 +308,7 @@ def test_pytorch_restful_tp2_longtext_512k(config, run_config, worker_id): @pytest.mark.gpu_num_2 @pytest.mark.mtp @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_mtp_models('pytorch', {'tp': 2})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 2}, func_type='mtp_evaluate')) def test_pytorch_restful_tp2_mtp(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'infer', eval_subpath='mtp') @@ -380,7 +318,7 @@ def test_pytorch_restful_tp2_mtp(config, run_config, worker_id): @pytest.mark.gpu_num_1 @pytest.mark.mtp @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_mtp_models('pytorch', {'tp': 1})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 1}, func_type='mtp_evaluate')) def test_pytorch_restful_tp1_mtp(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'infer', eval_subpath='mtp') @@ -390,7 +328,7 @@ def test_pytorch_restful_tp1_mtp(config, run_config, worker_id): @pytest.mark.gpu_num_2 @pytest.mark.mtp @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_mtp_models('pytorch', {'tp': 2})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 2}, func_type='mtp_evaluate')) def test_pytorch_eval_tp2_mtp(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval', eval_subpath='mtp') @@ -400,7 +338,7 @@ def test_pytorch_eval_tp2_mtp(config, run_config, worker_id): @pytest.mark.gpu_num_1 @pytest.mark.mtp @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_mtp_models('pytorch', {'tp': 1})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 1}, func_type='mtp_evaluate')) def test_pytorch_eval_tp1_mtp(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval', eval_subpath='mtp') @@ -410,7 +348,7 @@ def test_pytorch_eval_tp1_mtp(config, run_config, worker_id): @pytest.mark.gpu_num_4 @pytest.mark.test_ascend @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('pytorch', {'tp': 4})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 4}, func_type='evaluate')) def test_pytorch_restful_tp4(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'infer') @@ -420,7 +358,7 @@ def test_pytorch_restful_tp4(config, run_config, worker_id): @pytest.mark.gpu_num_8 @pytest.mark.test_ascend @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('pytorch', {'tp': 8})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 8}, func_type='evaluate')) def test_pytorch_restful_tp8(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'infer') @@ -430,7 +368,7 @@ def test_pytorch_restful_tp8(config, run_config, worker_id): @pytest.mark.gpu_num_16 @pytest.mark.test_ascend @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', 
get_models('pytorch', {'tp': 16})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 16}, func_type='evaluate')) def test_pytorch_restful_tp16(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'infer') @@ -439,7 +377,7 @@ def test_pytorch_restful_tp16(config, run_config, worker_id): @pytest.mark.pytorch @pytest.mark.gpu_num_distributed_tp16 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('pytorch', {'tp': 16})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 16}, func_type='evaluate')) def test_pytorch_restful_distributed_tp16(shared_ray_manager, config, run_config, worker_id): _run_ray_distributed_test(config=config, run_config=run_config, @@ -452,7 +390,7 @@ def test_pytorch_restful_distributed_tp16(shared_ray_manager, config, run_config @pytest.mark.pytorch @pytest.mark.gpu_num_distributed_dpep8 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('pytorch', {'dp': 8, 'ep': 8})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'dp': 8, 'ep': 8}, func_type='evaluate')) def test_pytorch_restful_distributed_dpep8(shared_proxy_manager, config, run_config, worker_id): _run_proxy_distributed_test(config=config, run_config=run_config, @@ -465,7 +403,7 @@ def test_pytorch_restful_distributed_dpep8(shared_proxy_manager, config, run_con @pytest.mark.pytorch @pytest.mark.gpu_num_distributed_dpep16 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('pytorch', {'dp': 16, 'ep': 16})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'dp': 16, 'ep': 16}, func_type='evaluate')) def test_pytorch_restful_distributed_dpep16(shared_proxy_manager, config, run_config, worker_id): _run_proxy_distributed_test(config=config, run_config=run_config, @@ -478,7 +416,7 @@ def test_pytorch_restful_distributed_dpep16(shared_proxy_manager, config, run_co @pytest.mark.turbomind @pytest.mark.gpu_num_1 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('turbomind', {'tp': 1})) +@pytest.mark.parametrize('run_config', get_func_config_list('turbomind', {'tp': 1}, func_type='evaluate')) def test_turbomind_eval_tp1(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval') @@ -487,7 +425,7 @@ def test_turbomind_eval_tp1(config, run_config, worker_id): @pytest.mark.turbomind @pytest.mark.gpu_num_2 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('turbomind', {'tp': 2})) +@pytest.mark.parametrize('run_config', get_func_config_list('turbomind', {'tp': 2}, func_type='evaluate')) def test_turbomind_eval_tp2(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval') @@ -496,7 +434,7 @@ def test_turbomind_eval_tp2(config, run_config, worker_id): @pytest.mark.turbomind @pytest.mark.gpu_num_4 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('turbomind', {'tp': 4})) +@pytest.mark.parametrize('run_config', get_func_config_list('turbomind', {'tp': 4}, func_type='evaluate')) def test_turbomind_eval_tp4(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval') @@ -505,7 +443,7 @@ def test_turbomind_eval_tp4(config, run_config, worker_id): @pytest.mark.turbomind @pytest.mark.gpu_num_8 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('turbomind', {'tp': 8})) +@pytest.mark.parametrize('run_config', 
get_func_config_list('turbomind', {'tp': 8}, func_type='evaluate')) def test_turbomind_eval_tp8(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval') @@ -515,7 +453,7 @@ def test_turbomind_eval_tp8(config, run_config, worker_id): @pytest.mark.gpu_num_1 @pytest.mark.test_ascend @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('pytorch', {'tp': 1})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 1}, func_type='evaluate')) def test_pytorch_eval_tp1(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval') @@ -525,7 +463,7 @@ def test_pytorch_eval_tp1(config, run_config, worker_id): @pytest.mark.gpu_num_2 @pytest.mark.test_ascend @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('pytorch', {'tp': 2})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 2}, func_type='evaluate')) def test_pytorch_eval_tp2(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval') @@ -535,7 +473,7 @@ def test_pytorch_eval_tp2(config, run_config, worker_id): @pytest.mark.gpu_num_4 @pytest.mark.test_ascend @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('pytorch', {'tp': 4})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 4}, func_type='evaluate')) def test_pytorch_eval_tp4(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval') @@ -545,7 +483,7 @@ def test_pytorch_eval_tp4(config, run_config, worker_id): @pytest.mark.gpu_num_8 @pytest.mark.test_ascend @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('pytorch', {'tp': 8})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 8}, func_type='evaluate')) def test_pytorch_eval_tp8(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval') @@ -555,7 +493,7 @@ def test_pytorch_eval_tp8(config, run_config, worker_id): @pytest.mark.gpu_num_16 @pytest.mark.test_ascend @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('pytorch', {'tp': 16})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 16}, func_type='evaluate')) def test_pytorch_eval_tp16(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval') @@ -564,7 +502,7 @@ def test_pytorch_eval_tp16(config, run_config, worker_id): @pytest.mark.pytorch @pytest.mark.gpu_num_distributed_tp16 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('pytorch', {'tp': 16})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'tp': 16}, func_type='evaluate')) def test_pytorch_eval_distributed_tp16(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval') @@ -573,7 +511,7 @@ def test_pytorch_eval_distributed_tp16(config, run_config, worker_id): @pytest.mark.pytorch @pytest.mark.gpu_num_distributed_dpep8 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('pytorch', {'dp': 8, 'ep': 8})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'dp': 8, 'ep': 8}, func_type='evaluate')) def test_pytorch_eval_distributed_dpep8(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval') @@ -582,7 +520,7 @@ def test_pytorch_eval_distributed_dpep8(config, run_config, worker_id): @pytest.mark.pytorch @pytest.mark.gpu_num_distributed_dpep16 
@pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('pytorch', {'dp': 16, 'ep': 16})) +@pytest.mark.parametrize('run_config', get_func_config_list('pytorch', {'dp': 16, 'ep': 16}, func_type='evaluate')) def test_pytorch_eval_distributed_dpep16(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval') @@ -591,24 +529,40 @@ def test_pytorch_eval_distributed_dpep16(config, run_config, worker_id): @pytest.mark.pytorch @pytest.mark.gpu_num_2 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_longtext_models('pytorch', {'tp': 2}, session_len=400000)) +@pytest.mark.parametrize( + 'run_config', + get_func_config_list( + 'pytorch', + {'tp': 2}, + func_type='longtext_evaluate', + extra={'session_len': 400000}, + ), +) def test_pytorch_eval_tp2_longtext(config, run_config, worker_id): - run_eval_test(config, run_config, worker_id, 'eval') + run_eval_test(config, run_config, worker_id, 'eval', eval_config_name='longtext-256k') @pytest.mark.eval @pytest.mark.pytorch @pytest.mark.gpu_num_2 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_longtext_models('pytorch', {'tp': 2}, session_len=700000)) +@pytest.mark.parametrize( + 'run_config', + get_func_config_list( + 'pytorch', + {'tp': 2}, + func_type='longtext_evaluate', + extra={'session_len': 700000}, + ), +) def test_pytorch_eval_tp2_longtext_512k(config, run_config, worker_id): - run_eval_test(config, run_config, worker_id, 'eval') + run_eval_test(config, run_config, worker_id, 'eval', eval_config_name='longtext-512k') @pytest.mark.eval @pytest.mark.turbomind @pytest.mark.gpu_num_distributed_cp2tp8 @pytest.mark.flaky(reruns=0) -@pytest.mark.parametrize('run_config', get_models('turbomind', {'cp': 2, 'tp': 8})) +@pytest.mark.parametrize('run_config', get_func_config_list('turbomind', {'cp': 2, 'tp': 8}, func_type='evaluate')) def test_turbomind_eval_cp2tp8(config, run_config, worker_id): run_eval_test(config, run_config, worker_id, 'eval') diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index 82883d189d..6e02c1efbb 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -49,7 +49,7 @@ def get_func_config_list(backend: str, parallel_config: Parallel config for tensor parallel model_type: Model type, default: chat_model func_type: Test func type filter, default: func - extra: extra config to update in each run config dict + extra: extra config merged into each run config's extra_params. 
Returns: list[dict]: All valid run config dicts """ @@ -110,8 +110,9 @@ def get_func_config_list(backend: str, run_config['extra_params']['max-batch-size'] = 128 run_config['extra_params']['model-format'] = 'fp8' - if 'Qwen3.5' in run_config['model']: - run_config['extra_params']['session-len'] = 128000 + if (func_type == 'evaluate' and 'session_len' not in extra + and 'session-len' not in extra and 'Qwen3.5' not in run_config['model']): + run_config['extra_params']['session_len'] = 65536 if config.get('env_tag', '') in ['3090', '5080']: run_config['extra_params']['cache-max-entry-count'] = 0.5 @@ -143,6 +144,14 @@ def get_func_config_list(backend: str, and func_type in ('benchmark', 'longtext_benchmark')): run_config['extra_params']['model-format'] = 'mxfp4' + if func_type == 'mtp_evaluate' and 'Qwen3.5' in run_config['model']: + run_config['extra_params'].update({ + 'reasoning-parser': 'qwen-qwq', + 'speculative-algorithm': 'qwen3_5_mtp', + 'speculative-num-draft-tokens': 4, + 'max-batch-size': 256, + }) + return run_configs diff --git a/autotest/utils/constant.py b/autotest/utils/constant.py index 1ac6d47d1d..9a366fca55 100644 --- a/autotest/utils/constant.py +++ b/autotest/utils/constant.py @@ -203,13 +203,6 @@ BACKEND_LIST = ['turbomind', 'pytorch'] -QWEN35_MTP_SERVER_EXTRA = { - 'reasoning-parser': 'qwen-qwq', - 'speculative-algorithm': 'qwen3_5_mtp', - 'speculative-num-draft-tokens': 4, - 'max-batch-size': 256, -} - RESTFUL_MODEL_LIST = [ 'Qwen/Qwen3-0.6B', 'Qwen/Qwen3-VL-2B-Instruct', 'Qwen/Qwen3-30B-A3B', 'internlm/Intern-S1', 'internlm/internlm2_5-20b', 'Qwen/Qwen3-32B', 'OpenGVLab/InternVL3_5-30B-A3B', 'OpenGVLab/InternVL3-38B',