diff --git a/.github/workflows/api_eval.yml b/.github/workflows/api_eval.yml index 6faaa136db..2d65d55275 100644 --- a/.github/workflows/api_eval.yml +++ b/.github/workflows/api_eval.yml @@ -43,7 +43,7 @@ env: HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true REPORT_DIR: /nvme/qa_test_models/evaluation_report/allure_report/${{ inputs.repo_ref }}_${{ github.run_id }} - COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy + COV_PARAM: --cov /opt/py3/lib/python3.12/site-packages/lmdeploy TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ inputs.repo_ref }}_${{ github.run_id }} OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy COMPASS_DATA_CACHE: /nvme/qa_test_models/compass_data_cache @@ -58,7 +58,7 @@ jobs: if: ${{github.event_name == 'schedule' || (!cancelled() && !inputs.offline_mode)}} strategy: matrix: - pyver: [py310] + pyver: [py312] runs-on: ubuntu-latest env: PYTHON_VERSION: ${{ matrix.pyver }} @@ -132,7 +132,7 @@ jobs: if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} uses: actions/download-artifact@v4 with: - name: my-artifact-${{ github.run_id }}-py310 + name: my-artifact-${{ github.run_id }}-py312 - name: Copy Artifacts if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index c106e5e043..327fc4b322 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -28,6 +28,16 @@ on: description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' type: boolean default: false + docker_tag: + required: true + description: 'Docker tag' + type: string + default: 'nightly-test-cu12.8' + result_tag: + required: true + description: 'result_tag if is not none, benchmark results will be uploaded to feishu' + type: string + default: "default" env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache @@ -45,7 +55,7 @@ jobs: if: ${{github.event_name == 'schedule' || (!cancelled() && !inputs.offline_mode)}} strategy: matrix: - pyver: [py310] + pyver: [py312] runs-on: ubuntu-latest env: PYTHON_VERSION: ${{ matrix.pyver }} @@ -93,7 +103,7 @@ jobs: runs-on: [self-hosted, linux-a100] timeout-minutes: 50 container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }} options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/qa_test_models:/nvme/qa_test_models @@ -117,7 +127,7 @@ jobs: if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} uses: actions/download-artifact@v4 with: - name: my-artifact-${{ github.run_id }}-py310 + name: my-artifact-${{ github.run_id }}-py312 - name: Copy Artifacts if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} @@ -153,7 +163,7 @@ jobs: TEST_ENV: ${{ matrix.transformers }} timeout-minutes: 480 container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }} options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -197,11 +207,25 @@ jobs: if: contains(fromJson(github.event.inputs.backend), 'pytorch') && !contains(fromJson(github.event.inputs.backend), 'turbomind') run: | pytest autotest/benchmark/test_${{matrix.benchmark_type}}_performance.py -n ${{matrix.n}} -m '${{matrix.gpu_num}} and not pr_test and not function and pytorch' --alluredir=${{env.ALLURE_REPORT_DIR}} + - name: Generate result + if: always() + run: | + cd /nvme/qa_test_models/feishu_upload + python3 test_benchmark.py --root ${{env.REPORT_DIR}} --output ${{env.REPORT_DIR}}/${{inputs.result_tag}}.txt --hardware A100 --infer-version ${{inputs.result_tag}} + - name: Async result + if: always() && inputs.result_tag != 'default' + env: + FEISHU_APP_ID: ${{secrets.FEISHU_APP_ID}} + FEISHU_APP_SECRET: ${{secrets.FEISHU_APP_SECRET}} + FEISHU_TABLE_TOKEN: ${{secrets.FEISHU_TABLE_TOKEN}} + FEISHU_TABLE_ID: ${{secrets.BENCHMARK_FEISHU_TABLE_ID}} + run: | + cd /nvme/qa_test_models/feishu_upload + python3 main.py --skip-duplicates ${{env.REPORT_DIR}}/${{inputs.result_tag}}.txt --config config-benchmark.py - name: Clear workfile if: always() run: | echo "status=done" >> ${{env.REPORT_DIR}}/status.txt - chmod -R 777 $REPORT_DIR export workdir=$(pwd) cd .. rm -rf $workdir diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index c4b99f8882..de053c6430 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml @@ -38,6 +38,11 @@ on: description: 'regression functions' type: string default: "['quant', 'tools','restful','pipeline','benchmark','evaluation']" + docker_tag: + required: true + description: 'Docker tag' + type: string + default: 'nightly-test-cu12.8' schedule: - cron: '00 14 * * 0-4' @@ -48,7 +53,7 @@ env: ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true ROOT_DIR: /nvme/qa_test_models REPORT_DIR: /nvme/qa_test_models/test-reports/${{ inputs.repo_ref || 'main' }}_${{ github.run_id }} - COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy + COV_PARAM: --cov /opt/py3/lib/python3.12/site-packages/lmdeploy TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ inputs.repo_ref || 'main' }}_${{ github.run_id }} OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt @@ -60,7 +65,7 @@ jobs: if: ${{!cancelled() && (github.event_name == 'schedule' || !inputs.offline_mode)}} strategy: matrix: - pyver: [py310] + pyver: [py312] runs-on: ubuntu-latest env: PYTHON_VERSION: ${{ matrix.pyver }} @@ -109,7 +114,7 @@ jobs: runs-on: [self-hosted, linux-a100] timeout-minutes: 50 container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }} options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/qa_test_models:/nvme/qa_test_models @@ -131,7 +136,7 @@ jobs: if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} uses: actions/download-artifact@v4 with: - name: my-artifact-${{ github.run_id }}-py310 + name: my-artifact-${{ github.run_id }}-py312 - name: Copy Artifacts if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} @@ -140,9 +145,13 @@ jobs: run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}} - name: Mark as start run: | - chmod -R 777 ${{env.TEST_CODE_PATH}} mkdir ${{env.REPORT_DIR}} -p echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt + - name: Clear workfile + if: always() + run: | + chmod -R 777 ${{env.TEST_CODE_PATH}} + chmod -R 777 ${{env.REPORT_DIR}} test_quantization: needs: download_pkgs @@ -158,7 +167,7 @@ jobs: MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules TEST_ENV: ${{ matrix.transformers }} container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }} options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -177,7 +186,7 @@ jobs: echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt - name: Install lmdeploy - dependency run: | - python3 -m pip install auto_gptq matplotlib attrdict + python3 -m pip install matplotlib attrdict python3 -m pip install -r requirements/lite.txt - name: Install lmdeploy run: | @@ -210,7 +219,6 @@ jobs: if: always() run: | echo "status=done" >> ${{env.REPORT_DIR}}/status.txt - chmod -R 777 ${{env.ROOT_DIR}} export workdir=$(pwd) cd .. rm -rf $workdir @@ -246,7 +254,7 @@ jobs: MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules TEST_ENV: ${{ matrix.transformers }} container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }} options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -330,7 +338,6 @@ jobs: if: always() run: | echo "status=done" >> ${{env.REPORT_DIR}}/status.txt - chmod -R 777 ${{env.ROOT_DIR}} export workdir=$(pwd) cd .. rm -rf $workdir @@ -422,7 +429,7 @@ jobs: extra: '--logprobs-mode raw_logprobs' timeout-minutes: 60 container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }} options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -527,7 +534,6 @@ jobs: if: always() run: | echo "status=done" >> ${{env.REPORT_DIR}}/status.txt - chmod -R 777 ${{env.ROOT_DIR}} export workdir=$(pwd) cd .. rm -rf $workdir @@ -540,7 +546,7 @@ jobs: needs: test_quantization timeout-minutes: 240 container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }} options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -590,7 +596,6 @@ jobs: if: always() run: | echo "status=done" >> ${{env.REPORT_DIR}}/status.txt - chmod -R 777 ${{env.ROOT_DIR}} export workdir=$(pwd) cd .. rm -rf $workdir @@ -604,7 +609,7 @@ jobs: needs: test_quantization timeout-minutes: 120 container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }} options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -646,7 +651,6 @@ jobs: if: always() run: | echo "status=done" >> ${{env.REPORT_DIR}}/status.txt - chmod -R 777 ${{env.ROOT_DIR}} export workdir=$(pwd) cd .. rm -rf $workdir @@ -671,7 +675,7 @@ jobs: generate_type: base timeout-minutes: 60 container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }} options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -777,7 +781,6 @@ jobs: if: always() run: | echo "status=done" >> ${{env.REPORT_DIR}}/status.txt - chmod -R 777 ${{env.ROOT_DIR}} export workdir=$(pwd) cd .. rm -rf $workdir @@ -790,7 +793,7 @@ jobs: needs: test_quantization timeout-minutes: 240 container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }} options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -841,7 +844,6 @@ jobs: if: always() run: | echo "status=done" >> ${{env.REPORT_DIR}}/status.txt - chmod -R 777 ${{env.ROOT_DIR}} export workdir=$(pwd) cd .. rm -rf $workdir @@ -854,7 +856,7 @@ jobs: needs: [test_tools, test_restful, test_pipeline, test_benchmark] timeout-minutes: 5 container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }} options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -866,7 +868,6 @@ jobs: run: cp -r ${{env.TEST_CODE_PATH}}/. . - name: Install lmdeploy run: | - echo "status=done" >> ${{env.REPORT_DIR}}/status.txt python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Get coverage report @@ -879,7 +880,7 @@ jobs: - name: Clear workfile if: always() run: | - chmod -R 777 ${{env.ROOT_DIR}} + chmod -R 777 ${{env.REPORT_DIR}} export workdir=$(pwd) cd .. rm -rf $workdir diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml index c322a9fd20..f1350256d9 100644 --- a/.github/workflows/daily_ete_test_3090.yml +++ b/.github/workflows/daily_ete_test_3090.yml @@ -38,6 +38,11 @@ on: description: 'regression functions' type: string default: "['quant', 'tools', 'restful']" + docker_tag: + required: true + description: 'Docker tag' + type: string + default: 'nightly-test-cu12.8' schedule: - cron: '00 14 * * 0-4' @@ -47,7 +52,7 @@ env: OUTPUT_FOLDER: cuda12.4_dist_${{ github.run_id }} ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true REPORT_DIR: /nvme/qa_test_models/test-reports/${{ inputs.repo_ref || 'main' }}_${{ github.run_id }} - COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy + COV_PARAM: --cov /opt/py3/lib/python3.12/site-packages/lmdeploy FAIL_CONFIG: ${{ github.event_name == 'schedule' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ inputs.repo_ref || 'main' }}_${{ github.run_id }} OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy @@ -59,7 +64,7 @@ jobs: if: ${{!cancelled() && (github.event_name == 'schedule' || !inputs.offline_mode)}} strategy: matrix: - pyver: [py310] + pyver: [py312] runs-on: ubuntu-latest env: PYTHON_VERSION: ${{ matrix.pyver }} @@ -108,7 +113,7 @@ jobs: runs-on: [self-hosted, 3090-r1] timeout-minutes: 50 container: - image: openmmlab/lmdeploy:latest-cu12 + image: openmmlab/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }} options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/qa_test_models:/nvme/qa_test_models @@ -131,7 +136,7 @@ jobs: if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} uses: actions/download-artifact@v4 with: - name: my-artifact-${{ github.run_id }}-py310 + name: my-artifact-${{ github.run_id }}-py312 - name: Copy Artifacts if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} @@ -154,7 +159,7 @@ jobs: MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules TEST_ENV: 3090_legacy container: - image: openmmlab/lmdeploy:latest-cu12 + image: openmmlab/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }} options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -169,7 +174,7 @@ jobs: echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt - name: Install lmdeploy - dependency run: | - python3 -m pip install auto_gptq matplotlib + python3 -m pip install matplotlib python3 -m pip install -r requirements/lite.txt - name: Install lmdeploy run: | @@ -232,7 +237,7 @@ jobs: MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules TEST_ENV: ${{matrix.transformers}} container: - image: openmmlab/lmdeploy:latest-cu12 + image: openmmlab/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }} options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -317,7 +322,7 @@ jobs: generate_type: base timeout-minutes: 60 container: - image: openmmlab/lmdeploy:latest-cu12 + image: openmmlab/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }} options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -406,7 +411,7 @@ jobs: needs: [test_tools, test_restful] timeout-minutes: 5 container: - image: openmmlab/lmdeploy:latest-cu12 + image: openmmlab/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }} options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip diff --git a/.github/workflows/daily_ete_test_5080.yml b/.github/workflows/daily_ete_test_5080.yml index 7e263202e4..1d74878e3d 100644 --- a/.github/workflows/daily_ete_test_5080.yml +++ b/.github/workflows/daily_ete_test_5080.yml @@ -38,6 +38,11 @@ on: description: 'regression functions' type: string default: "['quant', 'tools', 'restful']" + docker_tag: + required: true + description: 'Docker tag' + type: string + default: 'nightly-test-cu12.8' schedule: - cron: '00 14 * * 0-4' @@ -47,7 +52,7 @@ env: OUTPUT_FOLDER: cuda12.8_dist_${{ github.run_id }} ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true REPORT_DIR: /nvme/qa_test_models/test-reports/${{ inputs.repo_ref || 'main' }}_${{ github.run_id }} - COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy + COV_PARAM: --cov /opt/py3/lib/python3.12/site-packages/lmdeploy FAIL_CONFIG: ${{ github.event_name == 'schedule' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ inputs.repo_ref || 'main' }}_${{ github.run_id }} OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy @@ -59,7 +64,7 @@ jobs: if: ${{!cancelled() && (github.event_name == 'schedule' || !inputs.offline_mode)}} strategy: matrix: - pyver: [py310] + pyver: [py312] runs-on: ubuntu-latest env: PYTHON_VERSION: ${{ matrix.pyver }} @@ -108,7 +113,7 @@ jobs: runs-on: [self-hosted, 5080-r1] timeout-minutes: 50 container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }} options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/qa_test_models:/nvme/qa_test_models @@ -131,7 +136,7 @@ jobs: if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} uses: actions/download-artifact@v4 with: - name: my-artifact-${{ github.run_id }}-py310 + name: my-artifact-${{ github.run_id }}-py312 - name: Copy Artifacts if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} @@ -154,7 +159,7 @@ jobs: MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules TEST_ENV: 5080 container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }} options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -169,7 +174,7 @@ jobs: echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt - name: Install lmdeploy - dependency run: | - python3 -m pip install auto_gptq matplotlib + python3 -m pip install matplotlib python3 -m pip install -r requirements/lite.txt - name: Install lmdeploy run: | @@ -240,7 +245,7 @@ jobs: MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules TEST_ENV: ${{ matrix.transformers }} container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }} options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -334,7 +339,7 @@ jobs: generate_type: base timeout-minutes: 60 container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }} options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -434,7 +439,7 @@ jobs: needs: [test_tools, test_restful] timeout-minutes: 5 container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }} options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip diff --git a/.github/workflows/docker-nightly.yml b/.github/workflows/docker_nightly.yml similarity index 89% rename from .github/workflows/docker-nightly.yml rename to .github/workflows/docker_nightly.yml index 52ce78eec9..cd6521114f 100644 --- a/.github/workflows/docker-nightly.yml +++ b/.github/workflows/docker_nightly.yml @@ -8,12 +8,17 @@ on: description: 'Set branch or tag or commit id. Default is "main"' type: string default: 'main' + docker_tag: + required: false + description: 'Set Docker tag. Default is "cu12.8"' + type: string + default: 'cu12.8' schedule: - cron: '00 8 * * 0-4' env: - TAG: "openmmlab/lmdeploy:nightly-cu12.8" - DEV_TAG: "openmmlab/lmdeploy:nightly-test-cu12.8" + TAG: "openmmlab/lmdeploy:nightly-${{inputs.docker_tag || 'cu12.8'}}" + DEV_TAG: "openmmlab/lmdeploy:nightly-test-${{inputs.docker_tag || 'cu12.8'}}" jobs: publish_docker_image: @@ -76,7 +81,7 @@ jobs: needs: publish_docker_image env: INNER_REGISTRY: ${{ secrets.INNER_DOCKER_REGISTRY }} - INNER_TAG: "${{ secrets.INNER_DOCKER_REGISTRY }}/ailab-puyu-puyu_gpu/lmdeploy:nightly-cu12.8" + INNER_TAG: "${{ secrets.INNER_DOCKER_REGISTRY }}/ailab-puyu-puyu_gpu/lmdeploy:nightly-test-${{inputs.docker_tag || 'cu12.8'}}" steps: - name: Pull and push to inner run: | diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index a1c31d2c03..02358378f6 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -38,6 +38,11 @@ on: description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' type: boolean default: false + docker_tag: + required: true + description: 'Docker tag' + type: string + default: 'nightly-test-cu12.8' env: ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true @@ -48,7 +53,7 @@ jobs: if: ${{github.event_name == 'schedule' || (!cancelled() && !inputs.offline_mode)}} strategy: matrix: - pyver: [py310] + pyver: [py312] runs-on: ubuntu-latest env: PYTHON_VERSION: ${{ matrix.pyver }} @@ -101,7 +106,7 @@ jobs: matrix: evaluate_type: ['base'] container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }} options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip @@ -131,7 +136,7 @@ jobs: if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} uses: actions/download-artifact@v4 with: - name: my-artifact-${{ github.run_id }}-py310 + name: my-artifact-${{ github.run_id }}-py312 - name: Install lmdeploy - dependency run: | python3 -m pip install -r /root/models/offline_pkg/requirements.txt @@ -143,7 +148,7 @@ jobs: - name: Install lmdeploy - offline if: ${{inputs.offline_mode}} run: | - python3 -m pip install /root/models/offline_pkg/py310/lmdeploy-*.whl --no-deps + python3 -m pip install /root/models/offline_pkg/py312/lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Install opencompass run: | diff --git a/.github/workflows/mllm_api_eval.yml b/.github/workflows/mllm_api_eval.yml index b27b558087..61ed13d8e5 100644 --- a/.github/workflows/mllm_api_eval.yml +++ b/.github/workflows/mllm_api_eval.yml @@ -32,14 +32,18 @@ on: description: 'Set custom run ID. If not provided, github.run_id will be used' type: string default: '' - + docker_tag: + required: true + description: 'Docker tag' + type: string + default: 'nightly-test-cu12.8' env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true REPORT_DIR: /nvme/qa_test_models/mllm_evaluation_report/allure_report/${{ inputs.repo_ref }}_${{ github.run_id }} - COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy + COV_PARAM: --cov /opt/py3/lib/python3.12/site-packages/lmdeploy TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ inputs.repo_ref }}_${{ github.run_id }} OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt @@ -58,7 +62,7 @@ jobs: if: ${{ !cancelled() }} strategy: matrix: - pyver: [py310] + pyver: [py312] runs-on: ubuntu-latest env: PYTHON_VERSION: ${{ matrix.pyver }} @@ -107,7 +111,7 @@ jobs: runs-on: [self-hosted, linux-a100] timeout-minutes: 50 container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }} options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/qa_test_models:/nvme/qa_test_models @@ -131,7 +135,7 @@ jobs: if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} uses: actions/download-artifact@v4 with: - name: my-artifact-${{ github.run_id }}-py310 + name: my-artifact-${{ github.run_id }}-py312 - name: Copy Artifacts if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} @@ -158,7 +162,7 @@ jobs: env: TEST_ENV: ${{ matrix.transformers }} container: - image: openmmlab/lmdeploy:latest-cu12.8 + image: registry.h.pjlab.org.cn/ailab-puyu-puyu_gpu/lmdeploy:${{ inputs.docker_tag || 'nightly-test-cu12.8' }} options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip diff --git a/autotest/config_5080_legacy.yml b/autotest/config_5080_legacy.yml index 9d700e4240..677c5b5608 100644 --- a/autotest/config_5080_legacy.yml +++ b/autotest/config_5080_legacy.yml @@ -54,7 +54,13 @@ pytorch_base_model: turbomind_quantization: no_awq: + - meta-llama/Llama-3.2-3B-Instruct - OpenGVLab/InternVL3-2B-Instruct + - OpenGVLab/InternVL3-1B-Instruct + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-4B + - Qwen/Qwen3-1.7B + - Qwen/Qwen3-0.6B gptq: - empty no_kvint4: @@ -71,13 +77,9 @@ turbomind_quantization: pytorch_quantization: awq: - - meta-llama/Llama-3.2-3B-Instruct - - OpenGVLab/InternVL2_5-1B - - Qwen/Qwen3-4B - - Qwen/Qwen3-1.7B - - Qwen/Qwen3-0.6B + - empty w8a8: - - meta-llama/Llama-3.2-3B-Instruct + - empty no_kvint4: - meta-llama/Llama-3.2-1B-Instruct - OpenGVLab/InternVL3-2B-Instruct diff --git a/autotest/interface/restful/test_restful_completions_v1.py b/autotest/interface/restful/test_restful_completions_v1.py index d3be161dfe..8fbc42ac8a 100644 --- a/autotest/interface/restful/test_restful_completions_v1.py +++ b/autotest/interface/restful/test_restful_completions_v1.py @@ -183,7 +183,10 @@ def test_batch_prompt_order(self, backend, model_case): max_tokens=400, min_tokens=50): print(str(item)) - assert '天' in item.get('choices')[1].get('text'), item.get('choices')[1].get('text') - assert '梅' in item.get('choices')[3].get('text') or '对仗' in item.get('choices')[3].get('text'), item.get( - 'choices')[3].get('text') - assert '7' in item.get('choices')[4].get('text'), item.get('choices')[4].get('text') + assert '天' in item.get('choices')[1].get('text') or '雨' in item.get('choices')[1].get( + 'text') or '伞' in item.get('choices')[1].get('text'), item.get('choices')[1].get('text') + assert '梅' in item.get('choices')[3].get('text') or '对仗' in item.get('choices')[3].get( + 'text') or '仄' in item.get('choices')[3].get('text') or '诗' in item.get('choices')[3].get( + 'text'), item.get('choices')[3].get('text') + assert '7' in item.get('choices')[4].get('text') or '5+2' in item.get('choices')[4].get('text'), item.get( + 'choices')[4].get('text') diff --git a/autotest/interface/restful/test_restful_generate.py b/autotest/interface/restful/test_restful_generate.py index e08b5c3a92..6babb550e7 100644 --- a/autotest/interface/restful/test_restful_generate.py +++ b/autotest/interface/restful/test_restful_generate.py @@ -12,6 +12,8 @@ from utils.constant import BACKEND_LIST, DEFAULT_SERVER, RESTFUL_MODEL_LIST from utils.toolkit import encode_text, parse_sse_stream +from lmdeploy.serve.openai.api_client import APIClient + BASE_HTTP_URL = f'http://{DEFAULT_SERVER}' DEFAULT_PORT = 23333 BASE_URL = ':'.join([BASE_HTTP_URL, str(DEFAULT_PORT)]) @@ -940,7 +942,16 @@ def test_skip_special_tokens(self, config): def test_stop_token_ids(self): print(f'\n[Model: {self.model_name}] Running stop_token_ids test') - payload = {'prompt': 'Once upon a time', 'max_tokens': 500, 'stop_token_ids': [11, 281], 'stream': False} + api_client = APIClient(BASE_URL) + input_ids1, length1 = api_client.encode('.', add_bos=False) + print(f'input_ids1={input_ids1}, length1={length1}') + + payload = { + 'prompt': 'Once upon a time', + 'max_tokens': 500, + 'stop_token_ids': input_ids1, + 'stream': False + } resp = self._post(payload) assert resp.status_code == 200, \ @@ -957,12 +968,12 @@ def test_stop_token_ids(self): finish_reason = data.get('meta_info', {}).get('finish_reason', {}).get('type', 'unknown') actual_length = len(generated_text) - print(f'\n stop_token_ids=[11, 281] generation result: length={actual_length}, ' + print(f'\n stop_token_ids={input_ids1} generation result: length={actual_length}, ' f"end reason='{finish_reason}', text='{generated_text[:20]}...'") assert finish_reason in ['stop'], \ f'Expected generation to end due to stop token, ' \ - f'actual reason: {finish_reason}. This may mean stop_token_ids [11, 281] ' \ + f'actual reason: {finish_reason}. This may mean stop_token_ids {input_ids1} ' \ f"didn't take effect, or generation was truncated." def test_combined_parameters(self): diff --git a/autotest/prompt_case.yml b/autotest/prompt_case.yml index 468f3e49d6..01e65d0506 100644 --- a/autotest/prompt_case.yml +++ b/autotest/prompt_case.yml @@ -10,6 +10,7 @@ memory_test: - 新疆 - uwumqi - Ürümqi + - uyghur - 介绍它的相应美食#please introduce some delicious foods: - contain: - urumqi @@ -19,6 +20,7 @@ memory_test: - 新疆 - uwumqi - Ürümqi + - uyghur chinese_poem_case: - 给我一首中文诗,需要添加标点符号,请用中文回答Give me a Chinese poem in Chinese: diff --git a/autotest/tools/common_case_config.py b/autotest/tools/common_case_config.py index 12334e8815..7c58f693a3 100644 --- a/autotest/tools/common_case_config.py +++ b/autotest/tools/common_case_config.py @@ -1,3 +1,5 @@ +import os + TURBOMIND_PR_TEST_LLM_GPU2 = [{ 'model': 'Qwen/Qwen3-30B-A3B', 'backend': 'turbomind', @@ -88,7 +90,7 @@ }] TURBOMIND_FALLBACK_TEST_LLM_GPU1 = [{ - 'model': 'THUDM/cogvlm-chat-hf', + 'model': 'google/gemma-2-9b-it', 'backend': 'turbomind', 'communicator': 'cuda-ipc', 'quant_policy': 8, @@ -382,6 +384,7 @@ } }] -SPECULATIVE_DECODING_RESTFUL_TEST_LLM = [{ - **item, 'backend': 'pytorch' +SPECULATIVE_DECODING_RESTFUL_TEST_LLM = [] if not os.getenv('TEST_ENV') or os.getenv('TEST_ENV') == 'legacy' else [{ + **item, 'backend': + 'pytorch' } for item in BASE_SPECULATIVE_DECODING_RESTFUL_TEST_LLM] diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py index f0c4d7bf07..427fb0c884 100644 --- a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py +++ b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py @@ -7,15 +7,13 @@ PYTORCH_LORA_TEST_LLM_GPU2, PYTORCH_PR_TEST_LLM_GPU1, PYTORCH_PR_TEST_LLM_GPU2, - REASONING_TEST_LLM, SPECULATIVE_DECODING_RESTFUL_TEST_LLM, - TOOLCALL_TEST_LLM, ) from utils.config_utils import get_case_str_by_config, get_func_config_list, get_workerid from utils.constant import PROXY_PORT from utils.proxy_distributed_utils import ApiServerPerTest, proxy_worker_node_wait from utils.ray_distributed_utils import ray_worker_node_wait -from utils.run_restful_chat import run_all_step, run_llm_test, run_reasoning_case, run_tools_case +from utils.run_restful_chat import run_all_step, run_llm_test BACKEND = 'pytorch' @@ -186,56 +184,6 @@ def test_pytorch_chat_with_lora_tp2(config, run_config, common_case_config, work run_llm_test(config, run_config, common_case_config, worker_id) -@pytest.mark.usefixtures('common_case_config') -@pytest.mark.flaky(reruns=0) -@pytest.mark.gpu_num_1 -@pytest.mark.parametrize( - 'run_config', - [item for item in REASONING_TEST_LLM if item['backend'] == BACKEND and item['parallel_config'].get('tp') == 1]) -def test_restful_chat_reasoning_tp1(config, run_config, worker_id): - run_reasoning_case(config, run_config, worker_id) - - -@pytest.mark.usefixtures('common_case_config') -@pytest.mark.flaky(reruns=0) -@pytest.mark.gpu_num_2 -@pytest.mark.parametrize( - 'run_config', - [item for item in REASONING_TEST_LLM if item['backend'] == BACKEND and item['parallel_config'].get('tp') == 2]) -def test_restful_chat_reasoning_tp2(config, run_config, worker_id): - run_reasoning_case(config, run_config, worker_id) - - -@pytest.mark.usefixtures('common_case_config') -@pytest.mark.flaky(reruns=0) -@pytest.mark.gpu_num_1 -@pytest.mark.parametrize( - 'run_config', - [item for item in TOOLCALL_TEST_LLM if item['backend'] == BACKEND and item['parallel_config'].get('tp') == 1]) -def test_restful_chat_tools_tp1(config, run_config, worker_id): - run_tools_case(config, run_config, worker_id) - - -@pytest.mark.usefixtures('common_case_config') -@pytest.mark.flaky(reruns=0) -@pytest.mark.gpu_num_2 -@pytest.mark.parametrize( - 'run_config', - [item for item in TOOLCALL_TEST_LLM if item['backend'] == BACKEND and item['parallel_config'].get('tp') == 2]) -def test_restful_chat_tools_tp2(config, run_config, worker_id): - run_tools_case(config, run_config, worker_id) - - -@pytest.mark.usefixtures('common_case_config') -@pytest.mark.flaky(reruns=0) -@pytest.mark.gpu_num_4 -@pytest.mark.parametrize( - 'run_config', - [item for item in TOOLCALL_TEST_LLM if item['backend'] == BACKEND and item['parallel_config'].get('tp') == 4]) -def test_restful_chat_tools_tp4(config, run_config, worker_id): - run_tools_case(config, run_config, worker_id) - - @pytest.mark.usefixtures('common_case_config') @pytest.mark.flaky(reruns=0) @pytest.mark.gpu_num_1 diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py index a7460b6e72..9cc54c8d0c 100644 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py @@ -1,8 +1,6 @@ import pytest from tools.common_case_config import ( MODELSCOPE_CONFIG, - REASONING_TEST_LLM, - TOOLCALL_TEST_LLM, TURBOMIND_FALLBACK_TEST_LLM_GPU1, TURBOMIND_FALLBACK_TEST_LLM_GPU2, TURBOMIND_LOGPROBS_TEST_LLM_GPU2, @@ -10,7 +8,7 @@ TURBOMIND_PR_TEST_LLM_GPU2, ) from utils.config_utils import get_func_config_list, get_workerid -from utils.run_restful_chat import run_llm_test, run_logprob_test, run_reasoning_case, run_tools_case +from utils.run_restful_chat import run_llm_test, run_logprob_test BACKEND = 'turbomind' @@ -104,53 +102,3 @@ def test_restful_logprobs(config, run_config, worker_id): def test_modelscope_restful_chat_tp1(config, run_config, common_case_config, worker_id): case_config = {k: v for k, v in common_case_config.items() if k == 'memory_test'} run_llm_test(config, run_config, case_config, worker_id) - - -@pytest.mark.usefixtures('common_case_config') -@pytest.mark.flaky(reruns=0) -@pytest.mark.gpu_num_1 -@pytest.mark.parametrize( - 'run_config', - [item for item in REASONING_TEST_LLM if item['backend'] == BACKEND and item['parallel_config'].get('tp') == 1]) -def test_restful_chat_reasoning_tp1(config, run_config, worker_id): - run_reasoning_case(config, run_config, worker_id) - - -@pytest.mark.usefixtures('common_case_config') -@pytest.mark.flaky(reruns=0) -@pytest.mark.gpu_num_2 -@pytest.mark.parametrize( - 'run_config', - [item for item in REASONING_TEST_LLM if item['backend'] == BACKEND and item['parallel_config'].get('tp') == 2]) -def test_restful_chat_reasoning_tp2(config, run_config, worker_id): - run_reasoning_case(config, run_config, worker_id) - - -@pytest.mark.usefixtures('common_case_config') -@pytest.mark.flaky(reruns=0) -@pytest.mark.gpu_num_1 -@pytest.mark.parametrize( - 'run_config', - [item for item in TOOLCALL_TEST_LLM if item['backend'] == BACKEND and item['parallel_config'].get('tp') == 1]) -def test_restful_chat_tools_tp1(config, run_config, worker_id): - run_tools_case(config, run_config, worker_id) - - -@pytest.mark.usefixtures('common_case_config') -@pytest.mark.flaky(reruns=0) -@pytest.mark.gpu_num_2 -@pytest.mark.parametrize( - 'run_config', - [item for item in TOOLCALL_TEST_LLM if item['backend'] == BACKEND and item['parallel_config'].get('tp') == 2]) -def test_restful_chat_tools_tp2(config, run_config, worker_id): - run_tools_case(config, run_config, worker_id) - - -@pytest.mark.usefixtures('common_case_config') -@pytest.mark.flaky(reruns=0) -@pytest.mark.gpu_num_4 -@pytest.mark.parametrize( - 'run_config', - [item for item in TOOLCALL_TEST_LLM if item['backend'] == BACKEND and item['parallel_config'].get('tp') == 4]) -def test_restful_chat_tools_tp4(config, run_config, worker_id): - run_tools_case(config, run_config, worker_id) diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index d06a128054..a4e8beb0a2 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -318,17 +318,21 @@ def get_quantization_model_list(type: str) -> list[str]: config = get_config() quant_model_list = [] - if type == 'awq': - # Get all turbomind chat/base models & deduplicate - turbo_chat = _extract_models_from_config( - config['turbomind_chat_model']) if 'turbomind_chat_model' in config else [] - turbo_base = _extract_models_from_config( - config['turbomind_base_model']) if 'turbomind_base_model' in config else [] - all_turbo_models = list(OrderedDict.fromkeys(turbo_chat + turbo_base)) + # Get all chat/base models & deduplicate + turbomind_chat = _extract_models_from_config( + config['turbomind_chat_model']) if 'turbomind_chat_model' in config else [] + turbomind_base = _extract_models_from_config( + config['turbomind_base_model']) if 'turbomind_base_model' in config else [] + all_turbomind_models = list(OrderedDict.fromkeys(turbomind_chat + turbomind_base)) + + pytorch_chat = _extract_models_from_config(config['pytorch_chat_model']) if 'pytorch_chat_model' in config else [] + pytorch_base = _extract_models_from_config(config['pytorch_base_model']) if 'pytorch_base_model' in config else [] + all_pytorch_models = list(OrderedDict.fromkeys(pytorch_chat + pytorch_base)) + if type == 'awq': # Filter turbomind valid awq models no_awq = config.get('turbomind_quantization', {}).get('no_awq', []) - quant_model_list = [m for m in all_turbo_models if m not in no_awq and not is_quantization_model(m)] + quant_model_list = [m for m in all_turbomind_models if m not in no_awq and not is_quantization_model(m)] # Append pytorch awq models torch_awq = config.get('pytorch_quantization', {}).get('awq', []) @@ -337,10 +341,15 @@ def get_quantization_model_list(type: str) -> list[str]: quant_model_list.append(model) elif type == 'gptq': - quant_model_list = config.get('turbomind_quantization', {}).get(type, []) - + gptq_model_list = config.get('turbomind_quantization', {}).get(type, []) + for model in gptq_model_list: + if model in all_turbomind_models: + quant_model_list.append(model) elif type == 'w8a8': - quant_model_list = config.get('pytorch_quantization', {}).get(type, []) + w8a8_model_list = config.get('pytorch_quantization', {}).get(type, []) + for model in w8a8_model_list: + if model in all_pytorch_models: + quant_model_list.append(model) return quant_model_list diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index 5f74e06589..1240227af4 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -178,7 +178,7 @@ def get_response_from_output_by_prompt(output_text, case, prompt): def assert_pipeline_single_return(output, logprobs_num: int = 0): result = assert_pipeline_single_element(output, is_last=True, logprobs_num=logprobs_num) if not result: - return result, 'single_stream_element is wrong' + return result, f'single_stream_element is wrong {output}' return result & (len(output.token_ids) == output.generate_token_len or len(output.token_ids) == output.generate_token_len - 1), 'token_is len is not correct'