diff --git a/.github/workflows/api_eval.yml b/.github/workflows/api_eval.yml
index 6faaa136db..2d65d55275 100644
--- a/.github/workflows/api_eval.yml
+++ b/.github/workflows/api_eval.yml
@@ -43,7 +43,7 @@ env:
   HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai
   ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
   REPORT_DIR: /nvme/qa_test_models/evaluation_report/allure_report/${{ inputs.repo_ref }}_${{ github.run_id }}
-  COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy
+  COV_PARAM: --cov /opt/py3/lib/python3.12/site-packages/lmdeploy
   TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ inputs.repo_ref }}_${{ github.run_id }}
   OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy
   COMPASS_DATA_CACHE: /nvme/qa_test_models/compass_data_cache
@@ -58,7 +58,7 @@ jobs:
     if: ${{github.event_name == 'schedule' || (!cancelled() && !inputs.offline_mode)}}
     strategy:
       matrix:
-        pyver: [py310]
+        pyver: [py312]
     runs-on: ubuntu-latest
     env:
       PYTHON_VERSION: ${{ matrix.pyver }}
@@ -132,7 +132,7 @@ jobs:
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
         uses: actions/download-artifact@v4
         with:
-          name: my-artifact-${{ github.run_id }}-py310
+          name: my-artifact-${{ github.run_id }}-py312
       - name: Copy Artifacts
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
         run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}}
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index c106e5e043..327fc4b322 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -28,6 +28,16 @@ on:
         description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself'
         type: boolean
         default: false
+      docker_tag:
+        required: true
+        description: 'Docker tag'
+        type: string
+        default: 'nightly-test-cu12.8'
+      result_tag:
+        required: true
+        description: 'Result tag. If it is not "default", benchmark results will be uploaded to Feishu'
+        type: string
+        default: "default"
 
 env:
   HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
@@ -45,7 +55,7 @@ jobs:
     if: ${{github.event_name == 'schedule' || (!cancelled() && !inputs.offline_mode)}}
     strategy:
       matrix:
-        pyver: [py310]
+        pyver: [py312]
     runs-on: ubuntu-latest
     env:
       PYTHON_VERSION: ${{ matrix.pyver }}
@@ -93,7 +103,7 @@ jobs:
     runs-on: [self-hosted, linux-a100]
     timeout-minutes: 50
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }}
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/qa_test_models:/nvme/qa_test_models
@@ -117,7 +127,7 @@ jobs:
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
         uses: actions/download-artifact@v4
         with:
-          name: my-artifact-${{ github.run_id }}-py310
+          name: my-artifact-${{ github.run_id }}-py312
       - name: Copy Artifacts
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
         run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}}
@@ -153,7 +163,7 @@ jobs:
       TEST_ENV: ${{ matrix.transformers }}
     timeout-minutes: 480
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }}
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
@@ -197,11 +207,25 @@ jobs:
         if: contains(fromJson(github.event.inputs.backend), 'pytorch') && !contains(fromJson(github.event.inputs.backend), 'turbomind')
         run: |
             pytest autotest/benchmark/test_${{matrix.benchmark_type}}_performance.py -n ${{matrix.n}} -m '${{matrix.gpu_num}} and not pr_test and not function and pytorch' --alluredir=${{env.ALLURE_REPORT_DIR}}
+      - name: Generate result
+        if: always()
+        run: |
+            cd /nvme/qa_test_models/feishu_upload
+            python3 test_benchmark.py --root ${{env.REPORT_DIR}} --output ${{env.REPORT_DIR}}/${{inputs.result_tag}}.txt --hardware A100 --infer-version ${{inputs.result_tag}}
+      - name: Upload result to Feishu
+        if: always() && inputs.result_tag != 'default'
+        env:
+          FEISHU_APP_ID: ${{secrets.FEISHU_APP_ID}}
+          FEISHU_APP_SECRET: ${{secrets.FEISHU_APP_SECRET}}
+          FEISHU_TABLE_TOKEN: ${{secrets.FEISHU_TABLE_TOKEN}}
+          FEISHU_TABLE_ID: ${{secrets.BENCHMARK_FEISHU_TABLE_ID}}
+        run: |
+            cd /nvme/qa_test_models/feishu_upload
+            python3 main.py --skip-duplicates ${{env.REPORT_DIR}}/${{inputs.result_tag}}.txt --config config-benchmark.py
       - name: Clear workfile
         if: always()
         run: |
           echo "status=done" >> ${{env.REPORT_DIR}}/status.txt
-          chmod -R 777 $REPORT_DIR
           export workdir=$(pwd)
           cd ..
           rm -rf $workdir
diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml
index c4b99f8882..de053c6430 100644
--- a/.github/workflows/daily_ete_test.yml
+++ b/.github/workflows/daily_ete_test.yml
@@ -38,6 +38,11 @@ on:
         description: 'regression functions'
         type: string
         default: "['quant', 'tools','restful','pipeline','benchmark','evaluation']"
+      docker_tag:
+        required: true
+        description: 'Docker tag'
+        type: string
+        default: 'nightly-test-cu12.8'
   schedule:
     - cron:  '00 14 * * 0-4'
 
@@ -48,7 +53,7 @@ env:
   ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
   ROOT_DIR: /nvme/qa_test_models
   REPORT_DIR: /nvme/qa_test_models/test-reports/${{ inputs.repo_ref || 'main' }}_${{ github.run_id }}
-  COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy
+  COV_PARAM: --cov /opt/py3/lib/python3.12/site-packages/lmdeploy
   TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ inputs.repo_ref || 'main' }}_${{ github.run_id }}
   OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy
   OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt
@@ -60,7 +65,7 @@ jobs:
     if: ${{!cancelled() && (github.event_name == 'schedule' || !inputs.offline_mode)}}
     strategy:
       matrix:
-        pyver: [py310]
+        pyver: [py312]
     runs-on: ubuntu-latest
     env:
       PYTHON_VERSION: ${{ matrix.pyver }}
@@ -109,7 +114,7 @@ jobs:
     runs-on: [self-hosted, linux-a100]
     timeout-minutes: 50
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }}
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/qa_test_models:/nvme/qa_test_models
@@ -131,7 +136,7 @@ jobs:
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
         uses: actions/download-artifact@v4
         with:
-          name: my-artifact-${{ github.run_id }}-py310
+          name: my-artifact-${{ github.run_id }}-py312
       - name: Copy Artifacts
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
         run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}}
@@ -140,9 +145,13 @@ jobs:
         run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}}
       - name: Mark as start
         run: |
-          chmod -R 777 ${{env.TEST_CODE_PATH}}
           mkdir ${{env.REPORT_DIR}} -p
           echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt
+      - name: Clear workfile
+        if: always()
+        run: |
+          chmod -R 777 ${{env.TEST_CODE_PATH}}
+          chmod -R 777 ${{env.REPORT_DIR}}
 
   test_quantization:
     needs: download_pkgs
@@ -158,7 +167,7 @@ jobs:
       MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules
       TEST_ENV: ${{ matrix.transformers }}
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }}
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
@@ -177,7 +186,7 @@ jobs:
           echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt
       - name: Install lmdeploy - dependency
         run: |
-          python3 -m pip install auto_gptq matplotlib attrdict
+          python3 -m pip install matplotlib attrdict
           python3 -m pip install -r requirements/lite.txt
       - name: Install lmdeploy
         run: |
@@ -210,7 +219,6 @@ jobs:
         if: always()
         run: |
           echo "status=done" >> ${{env.REPORT_DIR}}/status.txt
-          chmod -R 777 ${{env.ROOT_DIR}}
           export workdir=$(pwd)
           cd ..
           rm -rf $workdir
@@ -246,7 +254,7 @@ jobs:
       MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules
       TEST_ENV: ${{ matrix.transformers }}
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }}
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
@@ -330,7 +338,6 @@ jobs:
         if: always()
         run: |
           echo "status=done" >> ${{env.REPORT_DIR}}/status.txt
-          chmod -R 777 ${{env.ROOT_DIR}}
           export workdir=$(pwd)
           cd ..
           rm -rf $workdir
@@ -422,7 +429,7 @@ jobs:
             extra: '--logprobs-mode raw_logprobs'
     timeout-minutes: 60
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }}
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
@@ -527,7 +534,6 @@ jobs:
         if: always()
         run: |
           echo "status=done" >> ${{env.REPORT_DIR}}/status.txt
-          chmod -R 777 ${{env.ROOT_DIR}}
           export workdir=$(pwd)
           cd ..
           rm -rf $workdir
@@ -540,7 +546,7 @@ jobs:
     needs: test_quantization
     timeout-minutes: 240
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }}
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
@@ -590,7 +596,6 @@ jobs:
         if: always()
         run: |
           echo "status=done" >> ${{env.REPORT_DIR}}/status.txt
-          chmod -R 777 ${{env.ROOT_DIR}}
           export workdir=$(pwd)
           cd ..
           rm -rf $workdir
@@ -604,7 +609,7 @@ jobs:
     needs: test_quantization
     timeout-minutes: 120
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }}
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
@@ -646,7 +651,6 @@ jobs:
         if: always()
         run: |
           echo "status=done" >> ${{env.REPORT_DIR}}/status.txt
-          chmod -R 777 ${{env.ROOT_DIR}}
           export workdir=$(pwd)
           cd ..
           rm -rf $workdir
@@ -671,7 +675,7 @@ jobs:
             generate_type: base
     timeout-minutes: 60
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }}
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
@@ -777,7 +781,6 @@ jobs:
         if: always()
         run: |
           echo "status=done" >> ${{env.REPORT_DIR}}/status.txt
-          chmod -R 777 ${{env.ROOT_DIR}}
           export workdir=$(pwd)
           cd ..
           rm -rf $workdir
@@ -790,7 +793,7 @@ jobs:
     needs: test_quantization
     timeout-minutes: 240
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }}
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
@@ -841,7 +844,6 @@ jobs:
         if: always()
         run: |
           echo "status=done" >> ${{env.REPORT_DIR}}/status.txt
-          chmod -R 777 ${{env.ROOT_DIR}}
           export workdir=$(pwd)
           cd ..
           rm -rf $workdir
@@ -854,7 +856,7 @@ jobs:
     needs: [test_tools, test_restful, test_pipeline, test_benchmark]
     timeout-minutes: 5
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }}
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
@@ -866,7 +868,6 @@ jobs:
         run: cp -r ${{env.TEST_CODE_PATH}}/. .
       - name: Install lmdeploy
         run: |
-          echo "status=done" >> ${{env.REPORT_DIR}}/status.txt
           python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps
           python3 -m pip install -r requirements/test.txt
       - name: Get coverage report
@@ -879,7 +880,7 @@ jobs:
       - name: Clear workfile
         if: always()
         run: |
-          chmod -R 777 ${{env.ROOT_DIR}}
+          chmod -R 777 ${{env.REPORT_DIR}}
           export workdir=$(pwd)
           cd ..
           rm -rf $workdir
diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml
index c322a9fd20..f1350256d9 100644
--- a/.github/workflows/daily_ete_test_3090.yml
+++ b/.github/workflows/daily_ete_test_3090.yml
@@ -38,6 +38,11 @@ on:
         description: 'regression functions'
         type: string
         default: "['quant', 'tools', 'restful']"
+      docker_tag:
+        required: true
+        description: 'Docker tag'
+        type: string
+        default: 'nightly-test-cu12.8'
   schedule:
     - cron:  '00 14 * * 0-4'
 
@@ -47,7 +52,7 @@ env:
   OUTPUT_FOLDER: cuda12.4_dist_${{ github.run_id }}
   ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
   REPORT_DIR: /nvme/qa_test_models/test-reports/${{ inputs.repo_ref || 'main' }}_${{ github.run_id }}
-  COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy
+  COV_PARAM: --cov /opt/py3/lib/python3.12/site-packages/lmdeploy
   FAIL_CONFIG: ${{ github.event_name == 'schedule' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}}
   TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ inputs.repo_ref || 'main' }}_${{ github.run_id }}
   OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy
@@ -59,7 +64,7 @@ jobs:
     if: ${{!cancelled() && (github.event_name == 'schedule' || !inputs.offline_mode)}}
     strategy:
       matrix:
-        pyver: [py310]
+        pyver: [py312]
     runs-on: ubuntu-latest
     env:
       PYTHON_VERSION: ${{ matrix.pyver }}
@@ -108,7 +113,7 @@ jobs:
     runs-on: [self-hosted, 3090-r1]
     timeout-minutes: 50
     container:
-      image: openmmlab/lmdeploy:latest-cu12
+      image: openmmlab/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }}
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/qa_test_models:/nvme/qa_test_models
@@ -131,7 +136,7 @@ jobs:
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
         uses: actions/download-artifact@v4
         with:
-          name: my-artifact-${{ github.run_id }}-py310
+          name: my-artifact-${{ github.run_id }}-py312
       - name: Copy Artifacts
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
         run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}}
@@ -154,7 +159,7 @@ jobs:
       MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules
       TEST_ENV: 3090_legacy
     container:
-      image: openmmlab/lmdeploy:latest-cu12
+      image: openmmlab/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }}
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
@@ -169,7 +174,7 @@ jobs:
           echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt
       - name: Install lmdeploy - dependency
         run: |
-          python3 -m pip install auto_gptq matplotlib
+          python3 -m pip install matplotlib
           python3 -m pip install -r requirements/lite.txt
       - name: Install lmdeploy
         run: |
@@ -232,7 +237,7 @@ jobs:
       MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules
       TEST_ENV: ${{matrix.transformers}}
     container:
-      image: openmmlab/lmdeploy:latest-cu12
+      image: openmmlab/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }}
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
@@ -317,7 +322,7 @@ jobs:
             generate_type: base
     timeout-minutes: 60
     container:
-      image: openmmlab/lmdeploy:latest-cu12
+      image: openmmlab/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }}
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
@@ -406,7 +411,7 @@ jobs:
     needs: [test_tools, test_restful]
     timeout-minutes: 5
     container:
-      image: openmmlab/lmdeploy:latest-cu12
+      image: openmmlab/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }}
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
diff --git a/.github/workflows/daily_ete_test_5080.yml b/.github/workflows/daily_ete_test_5080.yml
index 7e263202e4..1d74878e3d 100644
--- a/.github/workflows/daily_ete_test_5080.yml
+++ b/.github/workflows/daily_ete_test_5080.yml
@@ -38,6 +38,11 @@ on:
         description: 'regression functions'
         type: string
         default: "['quant', 'tools', 'restful']"
+      docker_tag:
+        required: true
+        description: 'Docker tag'
+        type: string
+        default: 'nightly-test-cu12.8'
   schedule:
     - cron:  '00 14 * * 0-4'
 
@@ -47,7 +52,7 @@ env:
   OUTPUT_FOLDER: cuda12.8_dist_${{ github.run_id }}
   ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
   REPORT_DIR: /nvme/qa_test_models/test-reports/${{ inputs.repo_ref || 'main' }}_${{ github.run_id }}
-  COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy
+  COV_PARAM: --cov /opt/py3/lib/python3.12/site-packages/lmdeploy
   FAIL_CONFIG: ${{ github.event_name == 'schedule' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}}
   TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ inputs.repo_ref || 'main' }}_${{ github.run_id }}
   OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy
@@ -59,7 +64,7 @@ jobs:
     if: ${{!cancelled() && (github.event_name == 'schedule' || !inputs.offline_mode)}}
     strategy:
       matrix:
-        pyver: [py310]
+        pyver: [py312]
     runs-on: ubuntu-latest
     env:
       PYTHON_VERSION: ${{ matrix.pyver }}
@@ -108,7 +113,7 @@ jobs:
     runs-on: [self-hosted, 5080-r1]
     timeout-minutes: 50
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: openmmlab/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }}
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/qa_test_models:/nvme/qa_test_models
@@ -131,7 +136,7 @@ jobs:
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
         uses: actions/download-artifact@v4
         with:
-          name: my-artifact-${{ github.run_id }}-py310
+          name: my-artifact-${{ github.run_id }}-py312
       - name: Copy Artifacts
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
         run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}}
@@ -154,7 +159,7 @@ jobs:
       MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules
       TEST_ENV: 5080
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: openmmlab/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }}
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
@@ -169,7 +174,7 @@ jobs:
           echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt
       - name: Install lmdeploy - dependency
         run: |
-          python3 -m pip install auto_gptq matplotlib
+          python3 -m pip install matplotlib
           python3 -m pip install -r requirements/lite.txt
       - name: Install lmdeploy
         run: |
@@ -240,7 +245,7 @@ jobs:
       MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules
       TEST_ENV: ${{ matrix.transformers }}
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: openmmlab/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }}
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
@@ -334,7 +339,7 @@ jobs:
             generate_type: base
     timeout-minutes: 60
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: openmmlab/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }}
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
@@ -434,7 +439,7 @@ jobs:
     needs: [test_tools, test_restful]
     timeout-minutes: 5
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: openmmlab/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }}
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
diff --git a/.github/workflows/docker-nightly.yml b/.github/workflows/docker_nightly.yml
similarity index 89%
rename from .github/workflows/docker-nightly.yml
rename to .github/workflows/docker_nightly.yml
index 52ce78eec9..cd6521114f 100644
--- a/.github/workflows/docker-nightly.yml
+++ b/.github/workflows/docker_nightly.yml
@@ -8,12 +8,17 @@ on:
         description: 'Set branch or tag or commit id. Default is "main"'
         type: string
         default: 'main'
+      docker_tag:
+        required: false
+        description: 'Set Docker tag suffix, appended to "nightly-" and "nightly-test-". Default is "cu12.8"'
+        type: string
+        default: 'cu12.8'
   schedule:
     - cron:  '00 8 * * 0-4'
 
 env:
-  TAG: "openmmlab/lmdeploy:nightly-cu12.8"
-  DEV_TAG: "openmmlab/lmdeploy:nightly-test-cu12.8"
+  TAG: "openmmlab/lmdeploy:nightly-${{inputs.docker_tag || 'cu12.8'}}"
+  DEV_TAG: "openmmlab/lmdeploy:nightly-test-${{inputs.docker_tag || 'cu12.8'}}"
 
 jobs:
   publish_docker_image:
@@ -76,7 +81,7 @@ jobs:
     needs: publish_docker_image
     env:
       INNER_REGISTRY: ${{ secrets.INNER_DOCKER_REGISTRY }}
-      INNER_TAG: "${{ secrets.INNER_DOCKER_REGISTRY }}/ailab-puyu-puyu_gpu/lmdeploy:nightly-cu12.8"
+      INNER_TAG: "${{ secrets.INNER_DOCKER_REGISTRY }}/ailab-puyu-puyu_gpu/lmdeploy:nightly-test-${{inputs.docker_tag || 'cu12.8'}}"
     steps:
       - name: Pull and push to inner
         run: |
diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml
index a1c31d2c03..02358378f6 100644
--- a/.github/workflows/evaluate.yml
+++ b/.github/workflows/evaluate.yml
@@ -38,6 +38,11 @@ on:
         description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself'
         type: boolean
         default: false
+      docker_tag:
+        required: true
+        description: 'Docker tag'
+        type: string
+        default: 'nightly-test-cu12.8'
 
 env:
   ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
@@ -48,7 +53,7 @@ jobs:
     if: ${{github.event_name == 'schedule' || (!cancelled() && !inputs.offline_mode)}}
     strategy:
       matrix:
-        pyver: [py310]
+        pyver: [py312]
     runs-on: ubuntu-latest
     env:
       PYTHON_VERSION: ${{ matrix.pyver }}
@@ -101,7 +106,7 @@ jobs:
       matrix:
         evaluate_type: ['base']
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }}
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
@@ -131,7 +136,7 @@ jobs:
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
         uses: actions/download-artifact@v4
         with:
-          name: my-artifact-${{ github.run_id }}-py310
+          name: my-artifact-${{ github.run_id }}-py312
       - name: Install lmdeploy - dependency
         run: |
           python3 -m pip install -r /root/models/offline_pkg/requirements.txt
@@ -143,7 +148,7 @@ jobs:
       - name: Install lmdeploy - offline
         if: ${{inputs.offline_mode}}
         run: |
-          python3 -m pip install /root/models/offline_pkg/py310/lmdeploy-*.whl --no-deps
+          python3 -m pip install /root/models/offline_pkg/py312/lmdeploy-*.whl --no-deps
           python3 -m pip install -r requirements/test.txt
       - name: Install opencompass
         run: |
diff --git a/.github/workflows/mllm_api_eval.yml b/.github/workflows/mllm_api_eval.yml
index b27b558087..61ed13d8e5 100644
--- a/.github/workflows/mllm_api_eval.yml
+++ b/.github/workflows/mllm_api_eval.yml
@@ -32,14 +32,18 @@ on:
         description: 'Set custom run ID. If not provided, github.run_id will be used'
         type: string
         default: ''
-
+      docker_tag:
+        required: true
+        description: 'Docker tag'
+        type: string
+        default: 'nightly-test-cu12.8'
 
 env:
   HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
   HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai
   ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
   REPORT_DIR: /nvme/qa_test_models/mllm_evaluation_report/allure_report/${{ inputs.repo_ref }}_${{ github.run_id }}
-  COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy
+  COV_PARAM: --cov /opt/py3/lib/python3.12/site-packages/lmdeploy
   TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ inputs.repo_ref }}_${{ github.run_id }}
   OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy
   OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt
@@ -58,7 +62,7 @@ jobs:
     if: ${{ !cancelled() }}
     strategy:
       matrix:
-        pyver: [py310]
+        pyver: [py312]
     runs-on: ubuntu-latest
     env:
       PYTHON_VERSION: ${{ matrix.pyver }}
@@ -107,7 +111,7 @@ jobs:
     runs-on: [self-hosted, linux-a100]
     timeout-minutes: 50
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }}
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/qa_test_models:/nvme/qa_test_models
@@ -131,7 +135,7 @@ jobs:
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
         uses: actions/download-artifact@v4
         with:
-          name: my-artifact-${{ github.run_id }}-py310
+          name: my-artifact-${{ github.run_id }}-py312
       - name: Copy Artifacts
         if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
         run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}}
@@ -158,7 +162,7 @@ jobs:
     env:
       TEST_ENV: ${{ matrix.transformers }}
     container:
-      image: openmmlab/lmdeploy:latest-cu12.8
+      image: registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }}
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
       volumes:
         - /nvme/github-actions/pip-cache:/root/.cache/pip
diff --git a/autotest/config_5080_legacy.yml b/autotest/config_5080_legacy.yml
index 9d700e4240..677c5b5608 100644
--- a/autotest/config_5080_legacy.yml
+++ b/autotest/config_5080_legacy.yml
@@ -54,7 +54,13 @@ pytorch_base_model:
 
 turbomind_quantization:
     no_awq:
+        - meta-llama/Llama-3.2-3B-Instruct
         - OpenGVLab/InternVL3-2B-Instruct
+        - OpenGVLab/InternVL3-1B-Instruct
+        - OpenGVLab/InternVL2_5-1B
+        - Qwen/Qwen3-4B
+        - Qwen/Qwen3-1.7B
+        - Qwen/Qwen3-0.6B
     gptq:
         - empty
     no_kvint4:
@@ -71,13 +77,9 @@ turbomind_quantization:
 
 pytorch_quantization:
     awq:
-        - meta-llama/Llama-3.2-3B-Instruct
-        - OpenGVLab/InternVL2_5-1B
-        - Qwen/Qwen3-4B
-        - Qwen/Qwen3-1.7B
-        - Qwen/Qwen3-0.6B
+        - empty
     w8a8:
-        - meta-llama/Llama-3.2-3B-Instruct
+        - empty
     no_kvint4:
         - meta-llama/Llama-3.2-1B-Instruct
         - OpenGVLab/InternVL3-2B-Instruct
diff --git a/autotest/interface/restful/test_restful_completions_v1.py b/autotest/interface/restful/test_restful_completions_v1.py
index d3be161dfe..8fbc42ac8a 100644
--- a/autotest/interface/restful/test_restful_completions_v1.py
+++ b/autotest/interface/restful/test_restful_completions_v1.py
@@ -183,7 +183,10 @@ def test_batch_prompt_order(self, backend, model_case):
                                               max_tokens=400,
                                               min_tokens=50):
             print(str(item))
-            assert '天' in item.get('choices')[1].get('text'), item.get('choices')[1].get('text')
-            assert '梅' in item.get('choices')[3].get('text') or '对仗' in item.get('choices')[3].get('text'), item.get(
-                'choices')[3].get('text')
-            assert '7' in item.get('choices')[4].get('text'), item.get('choices')[4].get('text')
+            # Each check passes if the completion text contains any one of its keywords.
+            text = item.get('choices')[1].get('text')
+            assert any(key in text for key in ('天', '雨', '伞')), text
+            text = item.get('choices')[3].get('text')
+            assert any(key in text for key in ('梅', '对仗', '仄', '诗')), text
+            text = item.get('choices')[4].get('text')
+            assert any(key in text for key in ('7', '5+2')), text
diff --git a/autotest/interface/restful/test_restful_generate.py b/autotest/interface/restful/test_restful_generate.py
index e08b5c3a92..6babb550e7 100644
--- a/autotest/interface/restful/test_restful_generate.py
+++ b/autotest/interface/restful/test_restful_generate.py
@@ -12,6 +12,8 @@
 from utils.constant import BACKEND_LIST, DEFAULT_SERVER, RESTFUL_MODEL_LIST
 from utils.toolkit import encode_text, parse_sse_stream
 
+from lmdeploy.serve.openai.api_client import APIClient
+
 BASE_HTTP_URL = f'http://{DEFAULT_SERVER}'
 DEFAULT_PORT = 23333
 BASE_URL = ':'.join([BASE_HTTP_URL, str(DEFAULT_PORT)])
@@ -940,7 +942,16 @@ def test_skip_special_tokens(self, config):
 
     def test_stop_token_ids(self):
         print(f'\n[Model: {self.model_name}] Running stop_token_ids test')
-        payload = {'prompt': 'Once upon a time', 'max_tokens': 500, 'stop_token_ids': [11, 281], 'stream': False}
+        api_client = APIClient(BASE_URL)
+        input_ids1, length1 = api_client.encode('.', add_bos=False)
+        print(f'input_ids1={input_ids1}, length1={length1}')
+
+        payload = {
+            'prompt': 'Once upon a time',
+            'max_tokens': 500,
+            'stop_token_ids': input_ids1,
+            'stream': False
+        }
 
         resp = self._post(payload)
         assert resp.status_code == 200, \
@@ -957,12 +968,12 @@ def test_stop_token_ids(self):
         finish_reason = data.get('meta_info', {}).get('finish_reason', {}).get('type', 'unknown')
         actual_length = len(generated_text)
 
-        print(f'\n stop_token_ids=[11, 281] generation result: length={actual_length}, '
+        print(f'\n stop_token_ids={input_ids1} generation result: length={actual_length}, '
               f"end reason='{finish_reason}', text='{generated_text[:20]}...'")
 
         assert finish_reason in ['stop'], \
             f'Expected generation to end due to stop token, ' \
-            f'actual reason: {finish_reason}. This may mean stop_token_ids [11, 281] ' \
+            f'actual reason: {finish_reason}. This may mean stop_token_ids {input_ids1} ' \
             f"didn't take effect, or generation was truncated."
 
     def test_combined_parameters(self):
diff --git a/autotest/prompt_case.yml b/autotest/prompt_case.yml
index 468f3e49d6..01e65d0506 100644
--- a/autotest/prompt_case.yml
+++ b/autotest/prompt_case.yml
@@ -10,6 +10,7 @@ memory_test:
             - 新疆
             - uwumqi
             - Ürümqi
+            - uyghur
     - 介绍它的相应美食#please introduce some delicious foods:
         - contain:
             - urumqi
@@ -19,6 +20,7 @@ memory_test:
             - 新疆
             - uwumqi
             - Ürümqi
+            - uyghur
 
 chinese_poem_case:
     - 给我一首中文诗，需要添加标点符号，请用中文回答Give me a Chinese poem in Chinese:
diff --git a/autotest/tools/common_case_config.py b/autotest/tools/common_case_config.py
index 12334e8815..7c58f693a3 100644
--- a/autotest/tools/common_case_config.py
+++ b/autotest/tools/common_case_config.py
@@ -1,3 +1,5 @@
+import os
+
 TURBOMIND_PR_TEST_LLM_GPU2 = [{
     'model': 'Qwen/Qwen3-30B-A3B',
     'backend': 'turbomind',
@@ -88,7 +90,7 @@
 }]
 
 TURBOMIND_FALLBACK_TEST_LLM_GPU1 = [{
-    'model': 'THUDM/cogvlm-chat-hf',
+    'model': 'google/gemma-2-9b-it',
     'backend': 'turbomind',
     'communicator': 'cuda-ipc',
     'quant_policy': 8,
@@ -382,6 +384,7 @@
     }
 }]
 
-SPECULATIVE_DECODING_RESTFUL_TEST_LLM = [{
-    **item, 'backend': 'pytorch'
+# Empty when TEST_ENV is unset, empty, or 'legacy'; otherwise the base cases with backend forced to 'pytorch'.
+SPECULATIVE_DECODING_RESTFUL_TEST_LLM = [] if os.getenv('TEST_ENV') in (None, '', 'legacy') else [{
+    **item, 'backend': 'pytorch'
 } for item in BASE_SPECULATIVE_DECODING_RESTFUL_TEST_LLM]
diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py
index f0c4d7bf07..427fb0c884 100644
--- a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py
+++ b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py
@@ -7,15 +7,13 @@
     PYTORCH_LORA_TEST_LLM_GPU2,
     PYTORCH_PR_TEST_LLM_GPU1,
     PYTORCH_PR_TEST_LLM_GPU2,
-    REASONING_TEST_LLM,
     SPECULATIVE_DECODING_RESTFUL_TEST_LLM,
-    TOOLCALL_TEST_LLM,
 )
 from utils.config_utils import get_case_str_by_config, get_func_config_list, get_workerid
 from utils.constant import PROXY_PORT
 from utils.proxy_distributed_utils import ApiServerPerTest, proxy_worker_node_wait
 from utils.ray_distributed_utils import ray_worker_node_wait
-from utils.run_restful_chat import run_all_step, run_llm_test, run_reasoning_case, run_tools_case
+from utils.run_restful_chat import run_all_step, run_llm_test
 
 BACKEND = 'pytorch'
 
@@ -186,56 +184,6 @@ def test_pytorch_chat_with_lora_tp2(config, run_config, common_case_config, work
     run_llm_test(config, run_config, common_case_config, worker_id)
 
 
-@pytest.mark.usefixtures('common_case_config')
-@pytest.mark.flaky(reruns=0)
-@pytest.mark.gpu_num_1
-@pytest.mark.parametrize(
-    'run_config',
-    [item for item in REASONING_TEST_LLM if item['backend'] == BACKEND and item['parallel_config'].get('tp') == 1])
-def test_restful_chat_reasoning_tp1(config, run_config, worker_id):
-    run_reasoning_case(config, run_config, worker_id)
-
-
-@pytest.mark.usefixtures('common_case_config')
-@pytest.mark.flaky(reruns=0)
-@pytest.mark.gpu_num_2
-@pytest.mark.parametrize(
-    'run_config',
-    [item for item in REASONING_TEST_LLM if item['backend'] == BACKEND and item['parallel_config'].get('tp') == 2])
-def test_restful_chat_reasoning_tp2(config, run_config, worker_id):
-    run_reasoning_case(config, run_config, worker_id)
-
-
-@pytest.mark.usefixtures('common_case_config')
-@pytest.mark.flaky(reruns=0)
-@pytest.mark.gpu_num_1
-@pytest.mark.parametrize(
-    'run_config',
-    [item for item in TOOLCALL_TEST_LLM if item['backend'] == BACKEND and item['parallel_config'].get('tp') == 1])
-def test_restful_chat_tools_tp1(config, run_config, worker_id):
-    run_tools_case(config, run_config, worker_id)
-
-
-@pytest.mark.usefixtures('common_case_config')
-@pytest.mark.flaky(reruns=0)
-@pytest.mark.gpu_num_2
-@pytest.mark.parametrize(
-    'run_config',
-    [item for item in TOOLCALL_TEST_LLM if item['backend'] == BACKEND and item['parallel_config'].get('tp') == 2])
-def test_restful_chat_tools_tp2(config, run_config, worker_id):
-    run_tools_case(config, run_config, worker_id)
-
-
-@pytest.mark.usefixtures('common_case_config')
-@pytest.mark.flaky(reruns=0)
-@pytest.mark.gpu_num_4
-@pytest.mark.parametrize(
-    'run_config',
-    [item for item in TOOLCALL_TEST_LLM if item['backend'] == BACKEND and item['parallel_config'].get('tp') == 4])
-def test_restful_chat_tools_tp4(config, run_config, worker_id):
-    run_tools_case(config, run_config, worker_id)
-
-
 @pytest.mark.usefixtures('common_case_config')
 @pytest.mark.flaky(reruns=0)
 @pytest.mark.gpu_num_1
diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py
index a7460b6e72..9cc54c8d0c 100644
--- a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py
+++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py
@@ -1,8 +1,6 @@
 import pytest
 from tools.common_case_config import (
     MODELSCOPE_CONFIG,
-    REASONING_TEST_LLM,
-    TOOLCALL_TEST_LLM,
     TURBOMIND_FALLBACK_TEST_LLM_GPU1,
     TURBOMIND_FALLBACK_TEST_LLM_GPU2,
     TURBOMIND_LOGPROBS_TEST_LLM_GPU2,
@@ -10,7 +8,7 @@
     TURBOMIND_PR_TEST_LLM_GPU2,
 )
 from utils.config_utils import get_func_config_list, get_workerid
-from utils.run_restful_chat import run_llm_test, run_logprob_test, run_reasoning_case, run_tools_case
+from utils.run_restful_chat import run_llm_test, run_logprob_test
 
 BACKEND = 'turbomind'
 
@@ -104,53 +102,3 @@ def test_restful_logprobs(config, run_config, worker_id):
 def test_modelscope_restful_chat_tp1(config, run_config, common_case_config, worker_id):
     case_config = {k: v for k, v in common_case_config.items() if k == 'memory_test'}
     run_llm_test(config, run_config, case_config, worker_id)
-
-
-@pytest.mark.usefixtures('common_case_config')
-@pytest.mark.flaky(reruns=0)
-@pytest.mark.gpu_num_1
-@pytest.mark.parametrize(
-    'run_config',
-    [item for item in REASONING_TEST_LLM if item['backend'] == BACKEND and item['parallel_config'].get('tp') == 1])
-def test_restful_chat_reasoning_tp1(config, run_config, worker_id):
-    run_reasoning_case(config, run_config, worker_id)
-
-
-@pytest.mark.usefixtures('common_case_config')
-@pytest.mark.flaky(reruns=0)
-@pytest.mark.gpu_num_2
-@pytest.mark.parametrize(
-    'run_config',
-    [item for item in REASONING_TEST_LLM if item['backend'] == BACKEND and item['parallel_config'].get('tp') == 2])
-def test_restful_chat_reasoning_tp2(config, run_config, worker_id):
-    run_reasoning_case(config, run_config, worker_id)
-
-
-@pytest.mark.usefixtures('common_case_config')
-@pytest.mark.flaky(reruns=0)
-@pytest.mark.gpu_num_1
-@pytest.mark.parametrize(
-    'run_config',
-    [item for item in TOOLCALL_TEST_LLM if item['backend'] == BACKEND and item['parallel_config'].get('tp') == 1])
-def test_restful_chat_tools_tp1(config, run_config, worker_id):
-    run_tools_case(config, run_config, worker_id)
-
-
-@pytest.mark.usefixtures('common_case_config')
-@pytest.mark.flaky(reruns=0)
-@pytest.mark.gpu_num_2
-@pytest.mark.parametrize(
-    'run_config',
-    [item for item in TOOLCALL_TEST_LLM if item['backend'] == BACKEND and item['parallel_config'].get('tp') == 2])
-def test_restful_chat_tools_tp2(config, run_config, worker_id):
-    run_tools_case(config, run_config, worker_id)
-
-
-@pytest.mark.usefixtures('common_case_config')
-@pytest.mark.flaky(reruns=0)
-@pytest.mark.gpu_num_4
-@pytest.mark.parametrize(
-    'run_config',
-    [item for item in TOOLCALL_TEST_LLM if item['backend'] == BACKEND and item['parallel_config'].get('tp') == 4])
-def test_restful_chat_tools_tp4(config, run_config, worker_id):
-    run_tools_case(config, run_config, worker_id)
diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py
index d06a128054..a4e8beb0a2 100644
--- a/autotest/utils/config_utils.py
+++ b/autotest/utils/config_utils.py
@@ -318,17 +318,21 @@ def get_quantization_model_list(type: str) -> list[str]:
     config = get_config()
     quant_model_list = []
 
-    if type == 'awq':
-        # Get all turbomind chat/base models & deduplicate
-        turbo_chat = _extract_models_from_config(
-            config['turbomind_chat_model']) if 'turbomind_chat_model' in config else []
-        turbo_base = _extract_models_from_config(
-            config['turbomind_base_model']) if 'turbomind_base_model' in config else []
-        all_turbo_models = list(OrderedDict.fromkeys(turbo_chat + turbo_base))
+    # Get all chat/base models & deduplicate
+    turbomind_chat = _extract_models_from_config(
+        config['turbomind_chat_model']) if 'turbomind_chat_model' in config else []
+    turbomind_base = _extract_models_from_config(
+        config['turbomind_base_model']) if 'turbomind_base_model' in config else []
+    all_turbomind_models = list(OrderedDict.fromkeys(turbomind_chat + turbomind_base))
+
+    pytorch_chat = _extract_models_from_config(config['pytorch_chat_model']) if 'pytorch_chat_model' in config else []
+    pytorch_base = _extract_models_from_config(config['pytorch_base_model']) if 'pytorch_base_model' in config else []
+    all_pytorch_models = list(OrderedDict.fromkeys(pytorch_chat + pytorch_base))
 
+    if type == 'awq':
         # Filter turbomind valid awq models
         no_awq = config.get('turbomind_quantization', {}).get('no_awq', [])
-        quant_model_list = [m for m in all_turbo_models if m not in no_awq and not is_quantization_model(m)]
+        quant_model_list = [m for m in all_turbomind_models if m not in no_awq and not is_quantization_model(m)]
 
         # Append pytorch awq models
         torch_awq = config.get('pytorch_quantization', {}).get('awq', [])
@@ -337,10 +341,15 @@ def get_quantization_model_list(type: str) -> list[str]:
                 quant_model_list.append(model)
 
     elif type == 'gptq':
-        quant_model_list = config.get('turbomind_quantization', {}).get(type, [])
-
+        # Keep only the configured gptq models that also appear among the turbomind chat/base models.
+        quant_model_list = [
+            m for m in config.get('turbomind_quantization', {}).get(type, []) if m in all_turbomind_models
+        ]
     elif type == 'w8a8':
-        quant_model_list = config.get('pytorch_quantization', {}).get(type, [])
+        # Keep only the configured w8a8 models that also appear among the pytorch chat/base models.
+        quant_model_list = [
+            m for m in config.get('pytorch_quantization', {}).get(type, []) if m in all_pytorch_models
+        ]
 
     return quant_model_list
 
diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py
index 5f74e06589..1240227af4 100644
--- a/autotest/utils/pipeline_chat.py
+++ b/autotest/utils/pipeline_chat.py
@@ -178,7 +178,7 @@ def get_response_from_output_by_prompt(output_text, case, prompt):
 def assert_pipeline_single_return(output, logprobs_num: int = 0):
     result = assert_pipeline_single_element(output, is_last=True, logprobs_num=logprobs_num)
     if not result:
-        return result, 'single_stream_element is wrong'
+        return result, f'single_stream_element is wrong {output}'
     return result & (len(output.token_ids) == output.generate_token_len
                      or len(output.token_ids) == output.generate_token_len - 1), 'token_is len is not correct'