diff --git a/.github/workflows/api_eval.yml b/.github/workflows/api_eval.yml index f8662c699b..ca4ce83467 100644 --- a/.github/workflows/api_eval.yml +++ b/.github/workflows/api_eval.yml @@ -32,7 +32,11 @@ on: description: 'Set custom run ID. If not provided, github.run_id will be used' type: string default: '' - + offline_mode: + required: true + description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' + type: boolean + default: false env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache @@ -40,11 +44,8 @@ env: ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true REPORT_DIR: /nvme/qa_test_models/evaluation_report/allure_report/${{ inputs.repo_ref }}_${{ github.run_id }} COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy - FAIL_CONFIG: '--lf' TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ inputs.repo_ref }}_${{ github.run_id }} OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy - OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt - DEEPSEEK_VL: /nvme/qa_test_models/offline_pkg/DeepSeek-VL COMPASS_DATA_CACHE: /nvme/qa_test_models/compass_data_cache HF_DATASETS_OFFLINE: 1 HF_DATASETS_CACHE: /nvme/qa_test_models/hf_datasets @@ -54,7 +55,7 @@ env: jobs: linux-build: - if: ${{ !cancelled() }} + if: ${{github.event_name == 'schedule' || (!cancelled() && !inputs.offline_mode)}} strategy: matrix: pyver: [py310] diff --git a/.github/workflows/api_eval_h800.yml b/.github/workflows/api_eval_legacy.yml similarity index 57% rename from .github/workflows/api_eval_h800.yml rename to .github/workflows/api_eval_legacy.yml index dc5678927f..e85729378c 100644 --- a/.github/workflows/api_eval_h800.yml +++ b/.github/workflows/api_eval_legacy.yml @@ -1,4 +1,4 @@ -name: api_eval_h800 +name: api_eval_legacy on: workflow_dispatch: @@ -32,31 +32,32 @@ on: description: 'Set custom run ID. 
If not provided, github.run_id will be used' type: string default: '' + offline_mode: + required: true + description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' + type: boolean + default: false env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai - OUTPUT_FOLDER: cuda12.8_dist_${{ github.run_id }} ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true REPORT_DIR: /nvme/qa_test_models/evaluation_report/allure_report/${{ inputs.repo_ref }}_${{ github.run_id }} COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy - FAIL_CONFIG: '--lf' TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ inputs.repo_ref }}_${{ github.run_id }} OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy - OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt - DEEPSEEK_VL: /nvme/qa_test_models/offline_pkg/DeepSeek-VL COMPASS_DATA_CACHE: /nvme/qa_test_models/compass_data_cache HF_DATASETS_OFFLINE: 1 HF_DATASETS_CACHE: /nvme/qa_test_models/hf_datasets HF_HUB_OFFLINE: 1 HF_EVALUATE_OFFLINE: 1 RUN_ID: ${{ inputs.repo_ref }}_${{ github.run_id }} - TEST_ENV: h800 + TEST_ENV: legacy jobs: linux-build: - if: ${{ !cancelled() }} + if: ${{github.event_name == 'schedule' || (!cancelled() && !inputs.offline_mode)}} strategy: matrix: pyver: [py310] @@ -67,8 +68,20 @@ jobs: DOCKER_TAG: cuda12.8 OUTPUT_FOLDER: cuda12.8_dist_${{ github.run_id }} steps: + - name: Free disk space + uses: jlumbroso/free-disk-space@main + with: + # This might remove tools that are actually needed, if set to "true" but frees about 6 GB + tool-cache: false + docker-images: false + # All of these default to true, but feel free to set to "false" if necessary for your workflow + android: true + dotnet: true + haskell: true + large-packages: true + swap-storage: false - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v6 with: repository: ${{ github.event.inputs.repo_org 
|| 'InternLM/lmdeploy' }} ref: ${{github.event.inputs.repo_ref || 'main'}} @@ -90,56 +103,95 @@ jobs: retention-days: 1 name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} - test_evaluation: + + download_pkgs: needs: linux-build - if: ${{ !cancelled() }} - runs-on: [self-hosted, h800-r1] - timeout-minutes: 2400 - strategy: - fail-fast: false - matrix: - backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + if: ${{!cancelled()}} + runs-on: [self-hosted, linux-a100] + timeout-minutes: 50 container: - image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:latest-cu12.8 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - - /nvme/github-actions/pip-cache:/root/.cache/pip - - /nvme/github-actions/packages:/root/packages - - /nvme/github-actions/resources:/root/resources - - /nvme/github-actions/opencompass-data:/root/opencompass-data - /nvme/qa_test_models:/nvme/qa_test_models - - /nvme1/qa_test_models:/nvme1/qa_test_models - - /nvme2/share:/nvme2/share - - /mnt/158_nvme2:/mnt/158_nvme2 - - /mnt/158_nvme3:/mnt/158_nvme3 - - /mnt/158_nvme4:/mnt/158_nvme4 + - /mnt/121:/mnt/121 + - /mnt/104:/mnt/104 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - - name: Create and change to _wk directory - run: | - echo "Working directory set to: $(pwd)" - name: Clone repository uses: actions/checkout@v2 + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} with: repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Copy repository + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && chmod 777 ${{env.TEST_CODE_PATH}} && cp -r . 
${{env.TEST_CODE_PATH}} + - name: Copy repository - offline + if: ${{inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && chmod 777 ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. ${{env.TEST_CODE_PATH}} - name: Download Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} uses: actions/download-artifact@v4 with: name: my-artifact-${{ github.run_id }}-py310 + - name: Copy Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + - name: Copy Artifacts - offline + if: ${{inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + - name: Mark as start + run: | + chmod -R 777 ${{env.TEST_CODE_PATH}} + mkdir ${{env.REPORT_DIR}} -p + echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt + + test_evaluation: + needs: download_pkgs + if: ${{ !cancelled() }} + runs-on: [self-hosted, linux-a100] + timeout-minutes: 7200 + strategy: + fail-fast: false + matrix: + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + gpu_num: ['gpu_num_1', 'gpu_num_2', 'gpu_num_4', 'gpu_num_8'] + container: + image: openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/resources:/root/resources + - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme/huggingface_hub:/nvme/huggingface_hub + - /mnt/121:/mnt/121 + - /mnt/104:/mnt/104 + - /mnt/bigdisk:/mnt/bigdisk + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . 
+ mkdir ${{env.REPORT_DIR}} -p + echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt - name: Install lmdeploy - dependency run: | - python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt - name: Install lmdeploy run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Install opencompass run: | - python3 -m pip install opencompass + git clone https://github.com/open-compass/opencompass.git --depth 1 + cd opencompass + python3 -m pip install . python3 -m pip install langdetect - name: Check env run: | + pip install transformers==4.57.6 python3 -m pip list lmdeploy check_env mkdir ${{env.REPORT_DIR}} -p @@ -148,17 +200,15 @@ jobs: if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') run: | overall_exit=0 - ln -s /nvme/qa_test_models/resource/opencompass-data/data ./data + ln -s /mnt/104/opencompass-data/data ./data ln -s /nvme/qa_test_models/resource/nltk_data /usr/share/nltk_data execution_mode="${{ github.event.inputs.execution_mode || 'both' }}" + ulimit -n 65535 if [ "$execution_mode" = "both" ] || [ "$execution_mode" = "infer" ]; then - pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_1 and ${{matrix.backend}} and infer" -n 8 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? - pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_2 and ${{matrix.backend}} and infer" -n 4 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? - pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_4 and ${{matrix.backend}} and infer" -n 2 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? - pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_8 and ${{matrix.backend}} and infer" -n 1 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? 
+ pytest autotest/evaluate/test_api_evaluate.py -m "${{matrix.gpu_num}} and ${{matrix.backend}} and infer" --alluredir=${{env.REPORT_DIR}} || overall_exit=$? fi if [ "$execution_mode" = "both" ] || [ "$execution_mode" = "eval" ]; then - pytest autotest/evaluate/test_api_evaluate.py -m "${{matrix.backend}} and eval" -n 4 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "${{matrix.gpu_num}} and ${{matrix.backend}} and eval" -n 4 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? fi exit $overall_exit - name: Clear workspace diff --git a/.github/workflows/benchmark_legacy.yml b/.github/workflows/benchmark_legacy.yml new file mode 100644 index 0000000000..68d7eafb25 --- /dev/null +++ b/.github/workflows/benchmark_legacy.yml @@ -0,0 +1,204 @@ +name: benchmark_test_legacy + +on: + workflow_dispatch: + inputs: + repo_org: + required: false + description: 'Tested repository organization name. Default is InternLM' + type: string + default: 'InternLM/lmdeploy' + repo_ref: + required: false + description: 'Set branch or tag or commit id. Default is "main"' + type: string + default: 'main' + benchmark_type: + required: true + description: 'Set benchmark type. Default is "["longtext", "throughput", "api_server", "prefixcache"]"' + type: string + default: "['apiserver', 'mllm_apiserver', 'throughput', 'longtext', 'prefixcache']" + backend: + required: true + description: 'Set backend filter. 
Default is "["turbomind", "pytorch"]"' + type: string + default: "['turbomind', 'pytorch']" + offline_mode: + required: true + description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' + type: boolean + default: false + +env: + HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache + HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai + OUTPUT_FOLDER: cuda12.8_dist_${{ github.run_id }} + REPORT_DIR: /nvme/qa_test_models/benchmark_report/${{ inputs.repo_ref }}_${{ github.run_id }} + ALLURE_REPORT_DIR: /nvme/qa_test_models/benchmark_report/allure_report/${{ inputs.repo_ref }}_${{ github.run_id }} + TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ inputs.repo_ref }}_${{ github.run_id }} + OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true + RUN_ID: ${{ inputs.repo_ref }}_${{ github.run_id }} + TEST_ENV: legacy + +jobs: + linux-build: + if: ${{github.event_name == 'schedule' || (!cancelled() && !inputs.offline_mode)}} + strategy: + matrix: + pyver: [py310] + runs-on: ubuntu-latest + env: + PYTHON_VERSION: ${{ matrix.pyver }} + PLAT_NAME: manylinux2014_x86_64 + DOCKER_TAG: cuda12.8 + steps: + - name: Free disk space + uses: jlumbroso/free-disk-space@main + with: + # This might remove tools that are actually needed, if set to "true" but frees about 6 GB + tool-cache: false + docker-images: false + # All of these default to true, but feel free to set to "false" if necessary for your workflow + android: true + dotnet: true + haskell: true + large-packages: true + swap-storage: false + - name: Checkout repository + uses: actions/checkout@v6 + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Build + run: | + echo ${PYTHON_VERSION} + echo ${PLAT_NAME} + echo ${DOCKER_TAG} + echo ${OUTPUT_FOLDER} + echo ${GITHUB_RUN_ID} + # remove -it + sed -i 's/docker run --rm -it/docker run 
--rm/g' builder/manywheel/build_wheel.sh + bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} + - name: Upload Artifacts + uses: actions/upload-artifact@v4 + with: + if-no-files-found: error + path: builder/manywheel/${{ env.OUTPUT_FOLDER }} + retention-days: 1 + name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} + + download_pkgs: + needs: linux-build + if: ${{!cancelled()}} + runs-on: [self-hosted, linux-a100] + timeout-minutes: 50 + container: + image: openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/121:/mnt/121 + - /mnt/104:/mnt/104 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Clone repository + uses: actions/checkout@v2 + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Copy repository + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && chmod 777 ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} + - name: Copy repository - offline + if: ${{inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && chmod 777 ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. 
${{env.TEST_CODE_PATH}} + - name: Download Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + uses: actions/download-artifact@v4 + with: + name: my-artifact-${{ github.run_id }}-py310 + - name: Copy Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + - name: Copy Artifacts - offline + if: ${{inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + - name: Mark as start + run: | + chmod -R 777 ${{env.TEST_CODE_PATH}} + mkdir ${{env.REPORT_DIR}} -p + echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt + + benchmark: + needs: download_pkgs + if: ${{github.event_name == 'schedule' || !cancelled()}} + runs-on: [self-hosted, linux-a100] + strategy: + fail-fast: false + matrix: + benchmark_type: ${{fromJSON(github.event.inputs.benchmark_type)}} + gpu_num: ['gpu_num_1', 'gpu_num_2', 'gpu_num_4', 'gpu_num_8'] + include: + - n: 8 + gpu_num: gpu_num_1 + - n: 4 + gpu_num: gpu_num_2 + - n: 2 + gpu_num: gpu_num_4 + - n: 1 + gpu_num: gpu_num_8 + timeout-minutes: 480 + container: + image: openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme/huggingface_hub:/nvme/huggingface_hub + - /mnt/121:/mnt/121 + - /mnt/104:/mnt/104 + - /mnt/bigdisk:/mnt/bigdisk + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . 
+ mkdir ${{env.REPORT_DIR}} -p + echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt + - name: Install lmdeploy - dependency + run: | + python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt + - name: Install lmdeploy + run: | + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip install transformers==4.57.6 + python3 -m pip list + lmdeploy check_env + - name: Run other benchmark - all + if: contains(fromJson(github.event.inputs.backend), 'turbomind') && contains(fromJson(github.event.inputs.backend), 'pytorch') + run: | + pytest autotest/benchmark/test_${{matrix.benchmark_type}}_performance.py -n ${{matrix.n}} -m '${{matrix.gpu_num}} and not pr_test and not function' --alluredir=${{env.ALLURE_REPORT_DIR}} + - name: Run other benchmark - turbomind + if: contains(fromJson(github.event.inputs.backend), 'turbomind') && !contains(fromJson(github.event.inputs.backend), 'pytorch') + run: | + pytest autotest/benchmark/test_${{matrix.benchmark_type}}_performance.py -n ${{matrix.n}} -m '${{matrix.gpu_num}} and not pr_test and not function and turbomind' --alluredir=${{env.ALLURE_REPORT_DIR}} + - name: Run other benchmark - pytorch + if: contains(fromJson(github.event.inputs.backend), 'pytorch') && !contains(fromJson(github.event.inputs.backend), 'turbomind') + run: | + pytest autotest/benchmark/test_${{matrix.benchmark_type}}_performance.py -n ${{matrix.n}} -m '${{matrix.gpu_num}} and not pr_test and not function and pytorch' --alluredir=${{env.ALLURE_REPORT_DIR}} + - name: Clear workfile + if: always() + run: | + echo "status=done" >> ${{env.REPORT_DIR}}/status.txt + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. 
+ rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index 525e9f4bea..18802033a9 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml @@ -48,7 +48,6 @@ env: ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true REPORT_DIR: /nvme/qa_test_models/test-reports/${{ inputs.repo_ref || 'main' }}_${{ github.run_id }} COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy - FAIL_CONFIG: ${{ github.event_name == 'schedule' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ inputs.repo_ref || 'main' }}_${{ github.run_id }} OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt @@ -183,6 +182,7 @@ jobs: rm -rf ${{env.DEEPSEEK_VL}}/build - name: Check env run: | + pip install transformers==4.57.6 python3 -m pip list lmdeploy check_env rm -rf allure-results @@ -334,28 +334,13 @@ jobs: fail-fast: false matrix: backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} - model_path: ['internlm/Intern-S1', 'internlm/internlm2_5-20b-chat', 'internlm/internlm2_5-20b', 'Qwen/Qwen3-8B-Base', 'Qwen/Qwen3-30B-A3B', 'Qwen/Qwen3-32B', 'OpenGVLab/InternVL3_5-30B-A3B', 'OpenGVLab/InternVL3-38B', 'Qwen/Qwen3-VL-8B-Instruct', 'Qwen/Qwen3-VL-30B-A3B-Instruct'] + model_path: ['Qwen/Qwen3-8B-Base', 'Qwen/Qwen3-30B-A3B', 'Qwen/Qwen3-32B', 'OpenGVLab/InternVL3_5-30B-A3B', 'OpenGVLab/InternVL3-38B', 'Qwen/Qwen3-VL-8B-Instruct', 'Qwen/Qwen3-VL-30B-A3B-Instruct'] include: - - tp: 2 - model: internlm2_5-20b-chat - model_path: internlm/internlm2_5-20b-chat - case_info: ['chat_completions_v1', 'generate'] - generate_type: base - - tp: 2 - model: internlm2_5-20b - model_path: internlm/internlm2_5-20b - case_info: ['completions_v1'] - generate_type: base - tp: 2 model: Qwen3-8B-Base model_path: Qwen/Qwen3-8B-Base 
case_info: ['completions_v1'] generate_type: base - - tp: 8 - model: Intern-S1 - model_path: internlm/Intern-S1 - case_info: ['chat_completions_v1', 'generate'] - generate_type: base - tp: 2 model: Qwen3-30B-A3B model_path: Qwen/Qwen3-30B-A3B @@ -644,7 +629,7 @@ jobs: ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test benchmark script run: | - pytest autotest/benchmark -n 4 -m function ${{env.FAIL_CONFIG}} --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/benchmark -n 4 -m function --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Clear workfile if: always() diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml index fd64cfbdd2..c322a9fd20 100644 --- a/.github/workflows/daily_ete_test_3090.yml +++ b/.github/workflows/daily_ete_test_3090.yml @@ -53,7 +53,6 @@ env: OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt RUN_ID: ${{ inputs.repo_ref || 'main' }}_${{ github.run_id }} - TEST_ENV: 3090 jobs: linux-build: @@ -153,6 +152,7 @@ jobs: PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules + TEST_ENV: 3090_legacy container: image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" @@ -178,6 +178,7 @@ jobs: - name: Check env run: | python3 -m pip list + pip install transformers==4.57.6 lmdeploy check_env rm -rf allure-results # remove tmp log in testcase @@ -215,6 +216,7 @@ jobs: fail-fast: false matrix: backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + transformers: ["3090", "3090_legacy"] model: ${{ fromJSON(inputs.model || '["llm", "mllm"]')}} function: ${{ fromJSON(inputs.function || 
'["pipeline","restful","chat"]')}} exclude: @@ -228,6 +230,7 @@ jobs: PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules + TEST_ENV: ${{matrix.transformers}} container: image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" @@ -249,6 +252,10 @@ jobs: run: | python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt + - name: Downgrade transformers + if: ${{matrix.transformers == '3090_legacy'}} + run: | + pip install transformers==4.57.6 - name: Check env run: | python3 -m pip list @@ -294,6 +301,7 @@ jobs: fail-fast: false matrix: backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + transformers: ["3090", "3090_legacy"] model_path: ['internlm/internlm3-8b-instruct', 'Qwen/Qwen3-8B'] include: - tp: 1 @@ -315,6 +323,8 @@ jobs: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/qa_test_models:/nvme/qa_test_models - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + env: + TEST_ENV: ${{matrix.transformers}} steps: - name: Copy repository and Artifacts run: | @@ -328,6 +338,10 @@ jobs: run: | python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt + - name: Downgrade transformers + if: ${{matrix.transformers == '3090_legacy'}} + run: | + pip install transformers==4.57.6 - name: Check env run: | python3 -m pip list diff --git a/.github/workflows/daily_ete_test_5080.yml b/.github/workflows/daily_ete_test_5080.yml index 093463db1e..f32d5f0590 100644 --- a/.github/workflows/daily_ete_test_5080.yml +++ b/.github/workflows/daily_ete_test_5080.yml @@ -53,7 +53,6 @@ env: OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy OFFLINE_REQUIREMENTS: 
/nvme/qa_test_models/offline_pkg/requirements.txt RUN_ID: ${{ inputs.repo_ref || 'main' }}_${{ github.run_id }} - TEST_ENV: 5080 jobs: linux-build: @@ -153,6 +152,7 @@ jobs: PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules + TEST_ENV: 5080_legacy container: image: openmmlab/lmdeploy:latest-cu12.8 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" @@ -177,6 +177,7 @@ jobs: python3 -m pip install -r requirements/test.txt - name: Check env run: | + pip install transformers==4.57.6 for i in $(seq 1 10); do output=$(lmdeploy check_env 2>&1) if echo "$output" | grep -q "CUDA available: False"; then @@ -225,6 +226,7 @@ jobs: matrix: backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} model: ${{ fromJSON(inputs.model || '["llm", "mllm"]')}} + transformers: ["5080", "5080_legacy"] function: ${{ fromJSON(inputs.function || '["pipeline","restful","chat"]')}} exclude: - backend: turbomind @@ -237,6 +239,7 @@ jobs: PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules + TEST_ENV: ${{ matrix.transformers }} container: image: openmmlab/lmdeploy:latest-cu12.8 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" @@ -258,6 +261,10 @@ jobs: run: | python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt + - name: Downgrade transformers + if: ${{matrix.transformers == '5080_legacy'}} + run: | + pip install transformers==4.57.6 - name: Check env run: | for i in $(seq 1 10); do @@ -313,6 +320,7 @@ jobs: matrix: backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} model_path: 
['meta-llama/Llama-3.2-3B-Instruct', 'Qwen/Qwen3-4B'] + transformers: ["5080", "5080_legacy"] include: - tp: 1 model: Llama-3.2-3B-Instruct @@ -334,6 +342,8 @@ jobs: - /nvme/qa_test_models:/nvme/qa_test_models - /mnt/3090:/mnt/3090 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + env: + TEST_ENV: ${{ matrix.transformers }} steps: - name: Copy repository and Artifacts run: | @@ -347,6 +357,10 @@ jobs: run: | python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt + - name: Downgrade transformers + if: ${{matrix.transformers == '5080_legacy'}} + run: | + pip install transformers==4.57.6 - name: Check env run: | for i in $(seq 1 10); do diff --git a/.github/workflows/daily_ete_test_h800.yml b/.github/workflows/daily_ete_test_h800.yml deleted file mode 100644 index b61123f6ef..0000000000 --- a/.github/workflows/daily_ete_test_h800.yml +++ /dev/null @@ -1,355 +0,0 @@ -name: daily_ete_test_h800 - -on: - workflow_dispatch: - inputs: - repo_org: - required: false - description: 'Tested repository organization name. Default is InternLM' - type: string - default: 'InternLM/lmdeploy' - repo_ref: - required: false - description: 'Set branch or tag or commit id. Default is "main"' - type: string - default: 'main' - backend: - required: true - description: 'Set backend filter. Default is "["turbomind", "pytorch"]"' - type: string - default: "['turbomind', 'pytorch']" - model: - required: true - description: 'Set testcase module filter: llm, mllm. Default contains all models' - type: string - default: "['llm','mllm']" - function: - required: true - description: 'Set testcase function filter: chat, restful, pipeline. 
Default contains all functions' - type: string - default: '["pipeline", "restful", "chat"]' - offline_mode: - required: true - description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' - type: boolean - default: false - regression_func: - required: true - description: 'regression functions' - type: string - default: "['tools','restful']" - schedule: - - cron: '00 14 * * 0-4' - -env: - HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache - HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai - OUTPUT_FOLDER: cuda12.8_dist_${{ github.run_id }} - ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true - REPORT_DIR: /nvme/qa_test_models/test-reports/${{ inputs.repo_ref || 'main' }}_${{ github.run_id }} - COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy - FAIL_CONFIG: ${{ github.event_name == 'schedule' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} - TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ inputs.repo_ref || 'main' }}_${{ github.run_id }} - OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy - OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt - RUN_ID: ${{ inputs.repo_ref || 'main' }}_${{ github.run_id }} - TEST_ENV: h800 - -jobs: - linux-build: - if: ${{!cancelled() && (github.event_name == 'schedule' || !inputs.offline_mode)}} - strategy: - matrix: - pyver: [py310] - runs-on: ubuntu-latest - env: - PYTHON_VERSION: ${{ matrix.pyver }} - PLAT_NAME: manylinux2014_x86_64 - DOCKER_TAG: cuda12.8 - steps: - - name: Free disk space - uses: jlumbroso/free-disk-space@main - with: - # This might remove tools that are actually needed, if set to "true" but frees about 6 GB - tool-cache: false - docker-images: false - # All of these default to true, but feel free to set to "false" if necessary for your workflow - android: true - dotnet: true - haskell: true - large-packages: true - swap-storage: false - - name: Checkout repository - uses: actions/checkout@v3 - with: - 
repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} - ref: ${{github.event.inputs.repo_ref || 'main'}} - - name: Build - run: | - echo ${PYTHON_VERSION} - echo ${PLAT_NAME} - echo ${DOCKER_TAG} - echo ${OUTPUT_FOLDER} - echo ${GITHUB_RUN_ID} - # remove -it - sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh - bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} - - name: Upload Artifacts - uses: actions/upload-artifact@v4 - with: - if-no-files-found: error - path: builder/manywheel/${{ env.OUTPUT_FOLDER }} - retention-days: 1 - name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} - - - download_pkgs: - needs: linux-build - if: ${{!cancelled()}} - runs-on: [self-hosted, h800-r1] - timeout-minutes: 50 - container: - image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" - volumes: - - /nvme/qa_test_models:/nvme/qa_test_models - - /nvme1/qa_test_models:/nvme1/qa_test_models - - /nvme2/share:/nvme2/share - - /mnt/137_nvme4:/mnt/137_nvme4 - - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro - steps: - - name: Clone repository - uses: actions/checkout@v2 - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - with: - repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} - ref: ${{github.event.inputs.repo_ref || 'main'}} - - name: Copy repository - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} - - name: Copy repository - offline - if: ${{inputs.offline_mode}} - run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. 
${{env.TEST_CODE_PATH}} - - name: Download Artifacts - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - uses: actions/download-artifact@v4 - with: - name: my-artifact-${{ github.run_id }}-py310 - - name: Copy Artifacts - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} - - name: Copy Artifacts - offline - if: ${{inputs.offline_mode}} - run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}} - - name: Mark as start - run: | - mkdir ${{env.REPORT_DIR}} -p - echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt - - test_tools: - if: ${{!cancelled() && !contains(needs.download_pkgs.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} - runs-on: [self-hosted, h800-r1] - needs: download_pkgs - timeout-minutes: 300 - strategy: - fail-fast: false - matrix: - backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} - model: ${{ fromJSON(inputs.model || '["llm", "mllm"]')}} - function: ${{ fromJSON(inputs.function || '["pipeline","restful","chat"]')}} - exclude: - - backend: turbomind - model: mllm - function: chat - - backend: pytorch - model: mllm - function: chat - env: - PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA - MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub - MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules - container: - image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" - volumes: - - /nvme/github-actions/pip-cache:/root/.cache/pip - - /nvme/qa_test_models:/nvme/qa_test_models - - /nvme1/qa_test_models:/nvme1/qa_test_models - - /nvme2/share:/nvme2/share - - /mnt/137_nvme2:/mnt/137_nvme2 - - 
/mnt/137_nvme3:/mnt/137_nvme3 - - /mnt/137_nvme4:/mnt/137_nvme4 - - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro - steps: - - name: Copy repository and Artifacts - run: | - cp -r ${{env.TEST_CODE_PATH}}/. . - - name: Install lmdeploy - dependency - run: | - python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - - name: Install lmdeploy - run: | - python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps - python3 -m pip install -r requirements/test.txt - - name: Check env - run: | - python3 -m pip list - lmdeploy check_env - rm -rf allure-results - # remove tmp log in testcase - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f - ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - - name: Test lmdeploy - chat - continue-on-error: true - if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' - run: | - pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test and not other' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test and not other' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_4 and not pr_test and not other' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_8 and not pr_test and not other' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Test lmdeploy - pipeline - 
continue-on-error: true - if: matrix.function == 'pipeline' - run: | - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test and not other' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test and not other' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_4 and not pr_test and not other' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test and not other' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Test lmdeploy - restful - continue-on-error: true - if: matrix.function == 'restful' - run: | - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test and not other' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test and not other' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_4 and not pr_test and not other' -n 2 --alluredir=${{env.REPORT_DIR}} 
${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test and not other' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Clear workfile - if: always() - run: | - chmod -R 777 $REPORT_DIR - export workdir=$(pwd) - cd .. - rm -rf $workdir - mkdir $workdir - chmod -R 777 $workdir - - test_restful: - if: ${{!cancelled() && !contains(needs.download_pkgs.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} - runs-on: [self-hosted, h800-r1] - needs: download_pkgs - strategy: - fail-fast: false - matrix: - backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} - model: ['Intern-S1'] - include: - - tp: 8 - model: Intern-S1 - timeout-minutes: 60 - container: - image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" - volumes: - - /nvme/github-actions/pip-cache:/root/.cache/pip - - /nvme/qa_test_models:/nvme/qa_test_models - - /nvme1/qa_test_models:/nvme1/qa_test_models - - /nvme2/share:/nvme2/share - - /mnt/137_nvme2:/mnt/137_nvme2 - - /mnt/137_nvme3:/mnt/137_nvme3 - - /mnt/137_nvme4:/mnt/137_nvme4 - - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro - steps: - - name: Copy repository and Artifacts - run: | - cp -r ${{env.TEST_CODE_PATH}}/. . 
- - name: Install lmdeploy - dependency - run: | - python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - - name: Install lmdeploy - run: | - python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps - python3 -m pip install -r requirements/test.txt - - name: Check env - run: | - python3 -m pip list - lmdeploy check_env - rm -rf allure-results - # remove tmp log in testcase - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f - ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - - name: Start restful api - if: matrix.model != 'internlm2_5-20b' - run: | - lmdeploy serve api_server /nvme/qa_test_models/internlm/${{matrix.model}} --tp ${{matrix.tp}} --backend ${{matrix.backend}} > ${{env.REPORT_DIR}}/${{matrix.backend}}_${{matrix.model}}_start_chat_restful.log 2>&1 & - echo "restful_pid=$!" >> "$GITHUB_ENV" - sleep 900s - - name: Test lmdeploy - restful api - if: matrix.model == 'Intern-S1' - timeout-minutes: 30 - run: | - pytest autotest/interface/restful/test_restful_chat_func.py -n 20 -m 'not not_${{matrix.backend}} and not internlm2_5' --alluredir=${{env.REPORT_DIR}}/interface-${{matrix.model}}-${{matrix.backend}}_ ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Kill api server - if: matrix.model != 'internlm2_5-20b' - run: | - kill -15 "$restful_pid" - - name: Clear workfile - if: always() - run: | - chmod -R 777 $REPORT_DIR - chmod -R 777 ${{env.REPORT_DIR}} - export workdir=$(pwd) - cd .. 
- rm -rf $workdir - mkdir $workdir - chmod -R 777 $workdir - - get_coverage_report: - if: ${{!cancelled()}} - runs-on: [self-hosted, h800-r1] - needs: [test_tools, test_restful] - timeout-minutes: 5 - container: - image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" - volumes: - - /nvme/github-actions/pip-cache:/root/.cache/pip - - /nvme/qa_test_models:/nvme/qa_test_models - - /nvme1/qa_test_models:/nvme1/qa_test_models - - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro - steps: - - name: Copy repository and Artifacts - run: cp -r ${{env.TEST_CODE_PATH}}/. . - - name: Install lmdeploy - run: | - echo "status=done" >> ${{env.REPORT_DIR}}/status.txt - python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps - python3 -m pip install -r requirements/test.txt - - name: Get coverage report - run: | - pip install coverage - coverage combine ${{env.REPORT_DIR}} - coverage xml -o ${{env.REPORT_DIR}}/coverage.xml - coverage report -m - mv .coverage ${{env.REPORT_DIR}}/.coverage - - name: Clear workfile - if: always() - run: | - chmod -R 777 $REPORT_DIR - export workdir=$(pwd) - cd .. - rm -rf $workdir - mkdir $workdir - chmod -R 777 $workdir diff --git a/.github/workflows/daily_ete_test_legacy.yml b/.github/workflows/daily_ete_test_legacy.yml new file mode 100644 index 0000000000..bcaefbc244 --- /dev/null +++ b/.github/workflows/daily_ete_test_legacy.yml @@ -0,0 +1,561 @@ +name: daily_ete_test_legacy + +on: + workflow_dispatch: + inputs: + repo_org: + required: false + description: 'Tested repository organization name. Default is InternLM' + type: string + default: 'InternLM/lmdeploy' + repo_ref: + required: false + description: 'Set branch or tag or commit id. Default is "main"' + type: string + default: 'main' + backend: + required: true + description: 'Set backend filter. 
Default is "["turbomind", "pytorch"]"' + type: string + default: "['turbomind', 'pytorch']" + model: + required: true + description: 'Set testcase module filter: llm, mllm. Default contains all models' + type: string + default: "['llm','mllm']" + function: + required: true + description: 'Set testcase function filter: chat, restful, pipeline. Default contains all functions' + type: string + default: '["pipeline", "restful", "chat"]' + offline_mode: + required: true + description: 'Whether to start in offline mode; if true, you should prepare the code and whl package yourself' + type: boolean + default: false + regression_func: + required: true + description: 'regression functions' + type: string + default: "['quant', 'tools','restful','pipeline','evaluation']" + schedule: + - cron: '00 14 * * 0-4' + +env: + HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache + HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai + OUTPUT_FOLDER: cuda12.8_dist_${{ github.run_id }} + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true + REPORT_DIR: /nvme/qa_test_models/test-reports/${{ inputs.repo_ref || 'main' }}_${{ github.run_id }} + COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy + TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ inputs.repo_ref || 'main' }}_${{ github.run_id }} + OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy + OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt + DEEPSEEK_VL: /nvme/qa_test_models/offline_pkg/DeepSeek-VL + RUN_ID: ${{ inputs.repo_ref || 'main' }}_${{ github.run_id }} + TEST_ENV: legacy + +jobs: + linux-build: + if: ${{!cancelled() && (github.event_name == 'schedule' || !inputs.offline_mode)}} + strategy: + matrix: + pyver: [py310] + runs-on: ubuntu-latest + env: + PYTHON_VERSION: ${{ matrix.pyver }} + PLAT_NAME: manylinux2014_x86_64 + DOCKER_TAG: cuda12.8 + steps: + - name: Free disk space + uses: jlumbroso/free-disk-space@main + with: + # This might remove tools that are actually needed, if set to
"true" but frees about 6 GB + tool-cache: false + docker-images: false + # All of these default to true, but feel free to set to "false" if necessary for your workflow + android: true + dotnet: true + haskell: true + large-packages: true + swap-storage: false + - name: Checkout repository + uses: actions/checkout@v6 + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Build + run: | + echo ${PYTHON_VERSION} + echo ${PLAT_NAME} + echo ${DOCKER_TAG} + echo ${OUTPUT_FOLDER} + echo ${GITHUB_RUN_ID} + # remove -it + sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh + bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} + - name: Upload Artifacts + uses: actions/upload-artifact@v4 + with: + if-no-files-found: error + path: builder/manywheel/${{ env.OUTPUT_FOLDER }} + retention-days: 1 + name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} + + + download_pkgs: + needs: linux-build + if: ${{!cancelled()}} + runs-on: [self-hosted, linux-a100] + timeout-minutes: 50 + container: + image: openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/qa_test_models:/nvme/qa_test_models + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Clone repository + uses: actions/checkout@v2 + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Copy repository + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && chmod 777 ${{env.TEST_CODE_PATH}} && cp -r . 
${{env.TEST_CODE_PATH}} + - name: Copy repository - offline + if: ${{inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && chmod 777 ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. ${{env.TEST_CODE_PATH}} + - name: Download Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + uses: actions/download-artifact@v4 + with: + name: my-artifact-${{ github.run_id }}-py310 + - name: Copy Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + - name: Copy Artifacts - offline + if: ${{inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + - name: Mark as start + run: | + chmod -R 777 ${{env.TEST_CODE_PATH}} + mkdir ${{env.REPORT_DIR}} -p + echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt + + test_quantization: + needs: download_pkgs + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'quant') )}} + runs-on: [self-hosted, linux-a100] + timeout-minutes: 150 + env: + PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA + MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub + MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules + container: + image: openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme/huggingface_hub:/nvme/huggingface_hub + - /mnt/121:/mnt/121 + - /mnt/104:/mnt/104 + - /mnt/bigdisk:/mnt/bigdisk + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r 
${{env.TEST_CODE_PATH}}/. . + mkdir ${{env.REPORT_DIR}} -p + echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt + - name: Install lmdeploy - dependency + run: | + python3 -m pip install auto_gptq matplotlib attrdict + python3 -m pip install -r requirements/lite.txt + - name: Install lmdeploy + run: | + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + pip install ${{env.DEEPSEEK_VL}} --no-deps + rm -rf ${{env.DEEPSEEK_VL}}/build + - name: Check env + run: | + pip install transformers==4.57.6 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - quantization w4a16 + continue-on-error: true + if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'turbomind') + run: | + pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - quantization w8a8 + continue-on-error: true + if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'pytorch') + run: | + pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + echo "status=done" >> ${{env.REPORT_DIR}}/status.txt + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. 
+ rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_tools: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} + runs-on: [self-hosted, linux-a100] + needs: test_quantization + timeout-minutes: 300 + strategy: + fail-fast: false + matrix: + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + model: ${{ fromJSON(inputs.model || '["llm", "mllm"]')}} + function: ${{ fromJSON(inputs.function || '["pipeline","restful","chat"]')}} + exclude: + - backend: turbomind + model: mllm + function: chat + - backend: pytorch + model: mllm + function: chat + include: + - backend: turbomind + model: llm + function: other + env: + PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA + MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub + MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules + container: + image: openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme/huggingface_hub:/nvme/huggingface_hub + - /mnt/121:/mnt/121 + - /mnt/104:/mnt/104 + - /mnt/bigdisk:/mnt/bigdisk + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . 
+ mkdir ${{env.REPORT_DIR}} -p + echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt + - name: Install lmdeploy - dependency + run: | + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + pip install ${{env.DEEPSEEK_VL}} --no-deps + rm -rf ${{env.DEEPSEEK_VL}}/build + - name: Check env + run: | + pip install transformers==4.57.6 + python3 -m pip list + lmdeploy check_env + cp -r /nvme/qa_test_models/offline_pkg/lora . + rm -rf allure-results + # remove tmp log in testcase + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - chat + continue-on-error: true + if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' + run: | + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - pipeline + continue-on-error: true + 
if: matrix.function == 'pipeline' + run: | + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - restful + continue-on-error: true + if: matrix.function == 'restful' + run: | + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest 
autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - local testcase + if: matrix.backend == 'turbomind' && matrix.model == 'llm' && matrix.function == 'other' + run: | + pytest autotest/toolchain --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + echo "status=done" >> ${{env.REPORT_DIR}}/status.txt + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_restful: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} + runs-on: [self-hosted, linux-a100] + needs: test_quantization + strategy: + fail-fast: false + matrix: + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + model_path: ['internlm/Intern-S1'] + include: + - tp: 8 + model: Intern-S1 + model_path: internlm/Intern-S1 + case_info: ['chat_completions_v1', 'generate'] + generate_type: base + timeout-minutes: 60 + container: + image: openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme/huggingface_hub:/nvme/huggingface_hub + - /mnt/121:/mnt/121 + - /mnt/104:/mnt/104 + - /mnt/bigdisk:/mnt/bigdisk + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . 
+ mkdir ${{env.REPORT_DIR}} -p + echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt + - name: Install lmdeploy - dependency + run: | + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip install transformers==4.57.6 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Start restful api + run: | + lmdeploy serve api_server /nvme/qa_test_models/${{matrix.model_path}} --tp ${{matrix.tp}} --backend ${{matrix.backend}} ${{matrix.extra}} --allow-terminate-by-client > ${{env.REPORT_DIR}}/${{matrix.backend}}_${{matrix.model}}_${{matrix.generate_type}}_start_restful.log 2>&1 & + echo "restful_pid=$!" 
+ for i in $(seq 1 240) + do + sleep 5 + echo "health check try $i" + if curl -f -s http://127.0.0.1:23333/health > /dev/null 2>&1; then + echo "health check success" + exit 0 + fi + done + + echo "health check fail" + curl -f -s http://127.0.0.1:23333/terminate > /dev/null 2>&1 + exit 1 + - name: Test lmdeploy - chat_completions_v1 + if: matrix.model != 'internlm2_5-20b-chat' && matrix.model != 'Intern-S1' && contains(matrix.case_info, 'chat_completions_v1') + timeout-minutes: 60 + run: | + pytest autotest/interface/restful/test_restful_chat_completions_v1.py -n 20 -k '${{matrix.model_path}} and ${{matrix.backend}}' -m 'not not_${{matrix.backend}} and not internlm2_5 and not interns1' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - chat_completions_v1 + if: matrix.model == 'Intern-S1' && contains(matrix.case_info, 'chat_completions_v1') + timeout-minutes: 60 + run: | + pytest autotest/interface/restful/test_restful_chat_completions_v1.py -n 20 -k '${{matrix.model_path}} and ${{matrix.backend}}' -m 'not not_${{matrix.backend}} and not internlm2_5' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - chat_completions_v1 - internlm2_5-20b-chat + if: matrix.model == 'internlm2_5-20b-chat' && contains(matrix.case_info, 'chat_completions_v1') + timeout-minutes: 60 + run: | + pytest autotest/interface/restful/test_restful_chat_completions_v1.py -n 20 -k '${{matrix.model_path}} and ${{matrix.backend}}' -m 'not not_${{matrix.backend}} and not interns1' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - completions_v1 - internlm2_5-20b + if: matrix.model == 'internlm2_5-20b' && contains(matrix.case_info, 'completions_v1') + timeout-minutes: 60 + run: | + pytest 
autotest/interface/restful/test_restful_completions_v1.py -n 20 -k '${{matrix.model_path}} and ${{matrix.backend}}' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - completions_v1 - other + if: matrix.model != 'internlm2_5-20b' && contains(matrix.case_info, 'completions_v1') + timeout-minutes: 60 + run: | + pytest autotest/interface/restful/test_restful_completions_v1.py -n 20 -k '${{matrix.model_path}} and ${{matrix.backend}} and not internlm2_5' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test generate - base + if: matrix.generate_type == 'base' && contains(matrix.case_info, 'generate') + timeout-minutes: 60 + run: | + pytest autotest/interface/restful/test_restful_generate.py -n 20 -k '${{matrix.model_path}} and ${{matrix.backend}}' -m 'not not_${{matrix.backend}} and not logprob and not experts' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test generate - logprob + if: matrix.generate_type == 'logprob' && contains(matrix.case_info, 'generate') + timeout-minutes: 60 + run: | + pytest autotest/interface/restful/test_restful_generate.py -n 20 -k '${{matrix.model_path}} and ${{matrix.backend}}' -m 'not not_${{matrix.backend}} and not experts' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test generate - all + if: matrix.generate_type == 'all' && contains(matrix.case_info, 'generate') + timeout-minutes: 60 + run: | + pytest autotest/interface/restful/test_restful_generate.py -n 20 -k '${{matrix.model_path}} and ${{matrix.backend}}' -m 'not not_${{matrix.backend}}' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') 
+ - name: Kill api server + if: always() + run: | + curl -f -s http://127.0.0.1:23333/terminate > /dev/null 2>&1 + - name: Clear workfile + if: always() + run: | + echo "status=done" >> ${{env.REPORT_DIR}}/status.txt + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_pipeline: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'pipeline'))}} + runs-on: [self-hosted, linux-a100] + needs: test_quantization + timeout-minutes: 240 + container: + image: openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme/huggingface_hub:/nvme/huggingface_hub + - /mnt/121:/mnt/121 + - /mnt/104:/mnt/104 + - /mnt/bigdisk:/mnt/bigdisk + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . 
+ mkdir ${{env.REPORT_DIR}} -p + echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt + - name: Install lmdeploy - dependency + run: | + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + pip install ${{env.DEEPSEEK_VL}} --no-deps + rm -rf ${{env.DEEPSEEK_VL}}/build + - name: Check env + run: | + pip install transformers==4.57.6 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - interface pipeline case + run: | + pytest autotest/interface/pipeline/test_pipeline_func.py -m 'not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_8 and not pr_test' -n 1 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: 
always() + run: | + echo "status=done" >> ${{env.REPORT_DIR}}/status.txt + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + get_coverage_report: + if: ${{!cancelled()}} + runs-on: [self-hosted, linux-a100] + needs: [test_tools, test_restful, test_pipeline] + timeout-minutes: 5 + container: + image: openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy + run: | + echo "status=done" >> ${{env.REPORT_DIR}}/status.txt + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Get coverage report + run: | + pip install coverage + coverage combine ${{env.REPORT_DIR}} + coverage xml -o ${{env.REPORT_DIR}}/coverage.xml + coverage report -m + mv .coverage ${{env.REPORT_DIR}}/.coverage + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index 0d87c58ccf..a1c31d2c03 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -15,9 +15,9 @@ on: default: 'main' base_models: required: true - description: 'Tested TurboMind models list. eg. 
[turbomind_qwen2_5_1_5b, turbomind_qwen2_5_7b, turbomind_qwen2_5_32b, turbomind_internlm2_5_7b, turbomind_glm_4_9b, turbomind_llama_3_1_8b, turbomind_llama_3_70b, pytorch_qwen2_5_1_5b, pytorch_qwen2_5_7b, pytorch_qwen2_5_32b, pytorch_internlm2_5_7b, pytorch_gemma_2_9b, pytorch_llama_3_70b, pytorch_llama_3_1_8b]' + description: 'Tested TurboMind models list. eg. [turbomind_qwen2_5_1_5b, turbomind_qwen2_5_7b, turbomind_qwen2_5_32b, turbomind_glm_4_9b, turbomind_llama_3_1_8b, turbomind_llama_3_70b, turbomind_qwen3_0_6b_base, turbomind_qwen3_8b_base, turbomind_qwen3_30b_A3B_base, pytorch_qwen2_5_1_5b, pytorch_qwen2_5_7b, pytorch_qwen2_5_32b, pytorch_gemma_2_9b, pytorch_llama_3_70b, pytorch_llama_3_1_8b, pytorch_qwen3_0_6b_base, pytorch_qwen3_8b_base, pytorch_qwen3_30b_A3B_base]' type: string - default: '[turbomind_qwen2_5_1_5b, turbomind_qwen2_5_7b, turbomind_qwen2_5_32b, turbomind_internlm2_5_7b, turbomind_glm_4_9b, turbomind_llama_3_1_8b, turbomind_llama_3_70b, pytorch_qwen2_5_1_5b, pytorch_qwen2_5_7b, pytorch_qwen2_5_32b, pytorch_internlm2_5_7b, pytorch_gemma_2_9b, pytorch_llama_3_70b, pytorch_llama_3_1_8b]' + default: '[turbomind_qwen2_5_1_5b, turbomind_qwen2_5_7b, turbomind_qwen2_5_32b, turbomind_glm_4_9b, turbomind_llama_3_1_8b, turbomind_llama_3_70b, turbomind_qwen3_0_6b_base, turbomind_qwen3_8b_base, turbomind_qwen3_30b_A3B_base, pytorch_qwen2_5_1_5b, pytorch_qwen2_5_7b, pytorch_qwen2_5_32b, pytorch_gemma_2_9b, pytorch_llama_3_70b, pytorch_llama_3_1_8b, pytorch_qwen3_0_6b_base, pytorch_qwen3_8b_base, pytorch_qwen3_30b_A3B_base]' baes_datasets: required: true description: 'Tested datasets list. eg. 
[*mmlu_datasets, *gsm8k_datasets]' @@ -69,7 +69,7 @@ jobs: large-packages: true swap-storage: false - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v6 with: repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} ref: ${{github.event.inputs.repo_ref || 'main'}} diff --git a/.github/workflows/evaluate_h800.yml b/.github/workflows/evaluate_h800.yml deleted file mode 100644 index ba10bc78fe..0000000000 --- a/.github/workflows/evaluate_h800.yml +++ /dev/null @@ -1,166 +0,0 @@ -name: evaluate_h800 - -on: - workflow_dispatch: - inputs: - repo_org: - required: false - description: 'Tested repository organization name. Default is InternLM/lmdeploy' - type: string - default: 'InternLM/lmdeploy' - repo_ref: - required: false - description: 'Set branch or tag or commit id. Default is "main"' - type: string - default: 'main' - base_models: - required: true - description: 'Tested TurboMind models list. eg. [turbomind_qwen3_0_6b_base, turbomind_qwen3_8b_base, turbomind_qwen3_30b_A3B_base, pytorch_qwen3_0_6b_base, pytorch_qwen3_8b_base, pytorch_qwen3_30b_A3B_base]' - type: string - default: '[turbomind_qwen3_0_6b_base, turbomind_qwen3_8b_base, turbomind_qwen3_30b_A3B_base, pytorch_qwen3_0_6b_base, pytorch_qwen3_8b_base, pytorch_qwen3_30b_A3B_base]' - baes_datasets: - required: true - description: 'Tested datasets list. eg. [*mmlu_datasets, *gsm8k_datasets]' - type: string - default: '[*race_datasets, *gsm8k_datasets, *gpqa_datasets, *winogrande_datasets]' - oc_repo_org: - required: false - description: 'Tested repository organization name. Default is open-compass/opencompass' - type: string - default: 'open-compass/opencompass' - oc_repo_ref: - required: false - description: 'Set branch or tag or commit id. 
Default is "main"' - type: string - default: 'main' - offline_mode: - required: true - description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' - type: boolean - default: false - -env: - ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true - COMPASS_DATA_CACHE: /nvme/qa_test_models/compass_data_cache - -jobs: - linux-build: - if: ${{github.event_name == 'schedule' || (!cancelled() && !inputs.offline_mode)}} - strategy: - matrix: - pyver: [py310] - runs-on: ubuntu-latest - env: - PYTHON_VERSION: ${{ matrix.pyver }} - PLAT_NAME: manylinux2014_x86_64 - DOCKER_TAG: cuda12.8 - OUTPUT_FOLDER: cuda12.4_dist_${{ github.run_id }} - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} - ref: ${{github.event.inputs.repo_ref || 'main'}} - - name: Build - run: | - echo ${PYTHON_VERSION} - echo ${PLAT_NAME} - echo ${DOCKER_TAG} - echo ${OUTPUT_FOLDER} - echo ${GITHUB_RUN_ID} - # remove -it - sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh - bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} - - name: Upload Artifacts - uses: actions/upload-artifact@v4 - with: - if-no-files-found: error - path: builder/manywheel/${{ env.OUTPUT_FOLDER }} - retention-days: 1 - name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} - - evaluate: - needs: linux-build - if: ${{github.event_name == 'schedule' || !cancelled()}} - runs-on: [self-hosted, h800-r1] - timeout-minutes: 4320 # 72hours - strategy: - fail-fast: false - matrix: - evaluate_type: ['base'] - container: - image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" - volumes: - - /nvme/github-actions/pip-cache:/root/.cache/pip - - /nvme/github-actions/packages:/root/packages - - 
/nvme/github-actions/resources:/root/resources - - /nvme/qa_test_models:/nvme/qa_test_models - - /nvme1/qa_test_models:/nvme1/qa_test_models - - /nvme2/share:/nvme2/share - - /mnt/137_nvme2:/mnt/137_nvme2 - - /mnt/137_nvme3:/mnt/137_nvme3 - - /mnt/137_nvme4:/mnt/137_nvme4 - - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro - steps: - - name: Setup systems - run: | - export TIME_STAMP="$(date +'%Y%m%d-%H%M%S')" - echo "TIME_STAMP=$TIME_STAMP" >> $GITHUB_ENV - - name: Clone repository - uses: actions/checkout@v2 - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - with: - repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} - ref: ${{github.event.inputs.repo_ref || 'main'}} - - name: Copy repository - offline - if: ${{inputs.offline_mode}} - run: cp -r /root/models/offline_pkg/lmdeploy/. . - - name: Download Artifacts - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - uses: actions/download-artifact@v4 - with: - name: my-artifact-${{ github.run_id }}-py310 - - name: Install lmdeploy - dependency - run: | - python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt - - name: Install lmdeploy - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - run: | - python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps - python3 -m pip install -r requirements/test.txt - - name: Install lmdeploy - offline - if: ${{inputs.offline_mode}} - run: | - python3 -m pip install /root/models/offline_pkg/py310/lmdeploy-*.whl --no-deps - python3 -m pip install -r requirements/test.txt - - name: Install opencompass - run: | - git clone https://github.com/${{ github.event.inputs.oc_repo_org}}.git - cd opencompass - git checkout ${{ github.event.inputs.oc_repo_ref}} - python3 -m pip install . 
- echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV - - name: Check env - run: | - python3 -m pip list - lmdeploy check_env - - name: Setup paths for evaluation - run: | - ln -s /nvme/qa_test_models/opencompass-data ./data - python3 .github/scripts/action_tools.py create_model_links /nvme/qa_test_models . - - name: Evaluate base models - if: matrix.evaluate_type == 'base' - run: | - echo ${{github.event.inputs.base_models}} - echo ${{github.event.inputs.baes_datasets}} - export LMDEPLOY_DIR=$(pwd) - python3 .github/scripts/action_tools.py evaluate "${{github.event.inputs.base_models}}" "${{github.event.inputs.baes_datasets}}" /root/evaluation_report/${{ github.run_id }} base - - name: Clear workspace - if: always() - run: | - export workdir=$(pwd) - cd .. - rm -rf $workdir - mkdir $workdir - chmod -R 777 $workdir diff --git a/.github/workflows/evaluate_remote.yml b/.github/workflows/evaluate_remote.yml deleted file mode 100644 index 200fea5983..0000000000 --- a/.github/workflows/evaluate_remote.yml +++ /dev/null @@ -1,217 +0,0 @@ -name: evaluate_remote - -on: - workflow_dispatch: - inputs: - repo_org: - required: false - description: 'Tested repository organization name. Default is open-compass/opencompass' - type: string - default: 'open-compass/opencompass' - repo_ref: - required: false - description: 'Set branch or tag or commit id. Default is "main"' - type: string - default: 'main' - build_lmdeploy: - required: false - description: 'whether to build lmdeploy' - type: boolean - default: true - repo_org_lmdeploy: - required: false - description: 'Tested repository organization name. Default is internlm/lmdeploy' - type: string - default: 'InternLM/lmdeploy' - repo_ref_lmdeploy: - required: false - description: 'Set branch or tag or commit id. 
Default is "main"' - type: string - default: 'main' - regression_func_volc: - required: true - description: 'regression functions' - type: string - default: "['chat_models','base_models']" - backend: - required: true - description: 'Set backend filter. Default is "["turbomind", "pytorch"]"' - type: string - default: "['turbomind', 'pytorch']" - -env: - HF_DATASETS_OFFLINE: 1 - HF_EVALUATE_OFFLINE: 1 - TRANSFORMERS_OFFLINE: 1 - LMDEPLOY_USE_MODELSCOPE: false - HF_HUB_OFFLINE: 1 - OUTPUT_FOLDER: cuda12.4_dist_${{ github.run_id }} - CONDA_PATH: /fs-computility/llm/qa-llm-cicd/miniconda3 - PIP_CACHE_PATH: /fs-computility/llm/qa-llm-cicd/.cache/pip - REPORT_ROOT: /fs-computility/llm/qa-llm-cicd/eval_report/lmdeploy_regression - COMPASS_DATA_CACHE: /fs-computility/llm/shared/llmeval/datasets/compass_data_cache - HUGGINGFACE_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub - HF_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub - CONDA_ENV: lmdeploy_test - -jobs: - build-pypi: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - with: - repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }} - ref: ${{github.event.inputs.repo_ref || 'main'}} - - name: Set up Python 3.10 - uses: actions/setup-python@v4 - with: - python-version: '3.10' - - name: Build Opencompass - run: | - pip install wheel setuptools - python setup.py sdist bdist_wheel - - name: Upload Artifacts - uses: actions/upload-artifact@v4 - with: - if-no-files-found: error - path: dist/* - retention-days: 1 - name: my-artifact-${{ github.run_id }} - - build-pypi-lmdeploy: - if: ${{!cancelled() && (github.event_name == 'schedule' || inputs.build_lmdeploy)}} - strategy: - matrix: - pyver: [py310] - runs-on: ubuntu-latest - env: - PYTHON_VERSION: ${{ matrix.pyver }} - PLAT_NAME: manylinux2014_x86_64 - DOCKER_TAG: cuda12.4 - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - repository: ${{ 
github.event.inputs.repo_org_lmdeploy || 'InternLM/lmdeploy' }} - ref: ${{github.event.inputs.repo_ref_lmdeploy || 'main'}} - - name: Build - run: | - echo ${PYTHON_VERSION} - echo ${PLAT_NAME} - echo ${DOCKER_TAG} - echo ${OUTPUT_FOLDER} - echo ${GITHUB_RUN_ID} - # remove -it - sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh - bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} - - name: Upload Artifacts - uses: actions/upload-artifact@v4 - with: - if-no-files-found: error - path: builder/manywheel/${{ env.OUTPUT_FOLDER }} - retention-days: 1 - name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} - - - prepare_env: - if: ${{!cancelled()}} - needs: ['build-pypi', 'build-pypi-lmdeploy'] - runs-on: lmdeploy-volc - timeout-minutes: 120 #2hours - steps: - - name: Clone repository - uses: actions/checkout@v2 - with: - repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }} - ref: ${{github.event.inputs.repo_ref || 'main'}} - - name: Download Artifacts - uses: actions/download-artifact@v4 - with: - name: my-artifact-${{ github.run_id }} - - name: Remove Conda Env - if: always() - run: | - . /fs-computility/llm/qa-llm-cicd/miniconda3/bin/activate - conda env remove -y --name ${{env.CONDA_ENV}} - conda info --envs - - name: Prepare - create conda env and install torch - cu12 - uses: nick-fields/retry@v3 - with: - max_attempts: 1 - timeout_minutes: 120 - command: | - . 
${{env.CONDA_PATH}}/bin/activate - conda create -y --name ${{env.CONDA_ENV}} python=3.10 - conda activate ${{env.CONDA_ENV}} - unset HTTP_PROXY;unset HTTPS_PROXY;unset http_proxy;unset https_proxy; - pip install -r /fs-computility/llm/qa-llm-cicd/config/requirements.txt --cache-dir ${{env.PIP_CACHE_PATH}} - pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}} - pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}} - pip install opencompass[full] --cache-dir ${{env.PIP_CACHE_PATH}} - pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}} - FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /fs-computility/llm/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl - cp -r /root/nltk_data ${{env.CONDA_PATH}}/envs/${{env.CONDA_ENV}}/nltk_data - - name: Prepare - download lmdeploy - cu12 - if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}} - uses: actions/download-artifact@v4 - with: - name: my-artifact-${{ github.run_id }}-py310 - - name: Prepare - reinstall lmdeploy - cu12 - if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}} - run: | - . ${{env.CONDA_PATH}}/bin/activate - conda activate ${{env.CONDA_ENV}} - pip uninstall -y lmdeploy - pip install lmdeploy-*.whl --no-deps - - name: conda env - run: | - . 
${{env.CONDA_PATH}}/bin/activate - conda activate ${{env.CONDA_ENV}} - conda info --envs - pip list - - run_test_volc: - if: ${{!cancelled()}} - needs: prepare_env - strategy: - fail-fast: false - matrix: - regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models"]')}} - backend: ${{ fromJSON(github.event.inputs.backend || '["turbomind", "pytorch"]')}} - runs-on: lmdeploy-volc - timeout-minutes: 480 #6hours - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - repository: ${{ github.event.inputs.repo_org_lmdeploy || 'InternLM/lmdeploy' }} - ref: ${{github.event.inputs.repo_ref_lmdeploy || 'main'}} - - name: modify config - run: | - mkdir opencompass - cp .github/scripts/eval_regression_${{matrix.regression_func}}.py opencompass/eval_regression_${{matrix.regression_func}}.py - cp -r /fs-computility/llm/qa-llm-cicd/ocplayground/template/configs_cluster/volc.py opencompass - cat /fs-computility/llm/qa-llm-cicd/config/lmdeploy_test_config.txt >> opencompass/eval_regression_${{matrix.regression_func}}.py - - name: modify config - models filter - if: matrix.backend == 'turbomind' - run: | - echo "models = sum([v for k, v in locals().items() if k.startswith('lmdeploy_')], [])" >> opencompass/eval_regression_${{matrix.regression_func}}.py - - name: modify config - models filter - if: matrix.backend == 'pytorch' - run: | - echo "models = sum([v for k, v in locals().items() if k.startswith('pytorch_')], [])" >> opencompass/eval_regression_${{matrix.regression_func}}.py - - name: Run test - uses: nick-fields/retry@v3 - with: - max_attempts: 1 - timeout_minutes: 180 - command: | - . 
${{env.CONDA_PATH}}/bin/activate - conda activate ${{env.CONDA_ENV}} - conda info --envs - unset HTTP_PROXY;unset HTTPS_PROXY;unset http_proxy;unset https_proxy; - echo "models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])" >> opencompass/eval_regression_${{matrix.regression_func}}.py - cd opencompass - opencompass eval_regression_${{matrix.regression_func}}.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}} --reuse --dump-eval-details - cd .. - python .github/scripts/action_tools.py generate_output_for_evaluation ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}} diff --git a/.github/workflows/mllm_api_eval.yml b/.github/workflows/mllm_api_eval.yml index a9b7921c8e..75220d794b 100644 --- a/.github/workflows/mllm_api_eval.yml +++ b/.github/workflows/mllm_api_eval.yml @@ -40,7 +40,6 @@ env: ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true REPORT_DIR: /nvme/qa_test_models/mllm_evaluation_report/allure_report/${{ inputs.repo_ref }}_${{ github.run_id }} COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy - FAIL_CONFIG: '--lf' TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ inputs.repo_ref }}_${{ github.run_id }} OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt diff --git a/.github/workflows/mllm_api_eval_legacy.yml b/.github/workflows/mllm_api_eval_legacy.yml new file mode 100644 index 0000000000..86fd787b0a --- /dev/null +++ b/.github/workflows/mllm_api_eval_legacy.yml @@ -0,0 +1,216 @@ +name: mllm_api_eval_legacy + +on: + workflow_dispatch: + inputs: + repo_org: + required: false + description: 'Tested repository organization name. Default is InternLM/lmdeploy' + type: string + default: 'InternLM/lmdeploy' + repo_ref: + required: false + description: 'Set branch or tag or commit id. Default is "main"' + type: string + default: 'main' + backend: + required: true + description: 'Set backend filter. 
Default is "["turbomind", "pytorch"]"' + type: string + default: "['turbomind', 'pytorch']" + execution_mode: + required: false + description: 'Select execution mode: infer, eval, or both. Default is "both"' + type: choice + options: + - both + - infer + - eval + default: 'both' + run_id: + required: false + description: 'Set custom run ID. If not provided, github.run_id will be used' + type: string + default: '' + + +env: + HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache + HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true + REPORT_DIR: /nvme/qa_test_models/mllm_evaluation_report/allure_report/${{ inputs.repo_ref }}_${{ github.run_id }} + COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy + TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ inputs.repo_ref }}_${{ github.run_id }} + OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy + OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt + DEEPSEEK_VL: /nvme/qa_test_models/offline_pkg/DeepSeek-VL + LMUData: /nvme/qa_test_models/LMUData + LOCAL_LLM: turbomind_Qwen2.5-32B-Instruct_nccl_tp2_0 + OPENAI_API_KEY: sk-empty + HF_DATASETS_OFFLINE: 1 + HF_DATASETS_CACHE: /nvme/qa_test_models/hf_datasets + HF_HUB_OFFLINE: 1 + HF_EVALUATE_OFFLINE: 1 + RUN_ID: ${{ inputs.repo_ref }}_${{ github.run_id }} + TEST_ENV: legacy + +jobs: + linux-build: + if: ${{ !cancelled() }} + strategy: + matrix: + pyver: [py310] + runs-on: ubuntu-latest + env: + PYTHON_VERSION: ${{ matrix.pyver }} + PLAT_NAME: manylinux2014_x86_64 + DOCKER_TAG: cuda12.8 + OUTPUT_FOLDER: cuda12.8_dist_${{ github.run_id }} + steps: + - name: Free disk space + uses: jlumbroso/free-disk-space@main + with: + # This might remove tools that are actually needed, if set to "true" but frees about 6 GB + tool-cache: false + docker-images: false + # All of these default to true, but feel free to set to "false" if necessary for your workflow + android: true + dotnet: true + 
haskell: true + large-packages: true + swap-storage: false + - name: Checkout repository + uses: actions/checkout@v3 + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Build + run: | + echo ${PYTHON_VERSION} + echo ${PLAT_NAME} + echo ${DOCKER_TAG} + echo ${OUTPUT_FOLDER} + echo ${GITHUB_RUN_ID} + # remove -it + sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh + bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} + - name: Upload Artifacts + uses: actions/upload-artifact@v4 + with: + if-no-files-found: error + path: builder/manywheel/${{ env.OUTPUT_FOLDER }} + retention-days: 1 + name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} + + download_pkgs: + needs: linux-build + if: ${{!cancelled()}} + runs-on: [self-hosted, linux-a100] + timeout-minutes: 50 + container: + image: openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/121:/mnt/121 + - /mnt/104:/mnt/104 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Clone repository + uses: actions/checkout@v2 + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Copy repository + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && chmod 777 ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} + - name: Copy repository - offline + if: ${{inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && chmod 777 ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. 
${{env.TEST_CODE_PATH}} + - name: Download Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + uses: actions/download-artifact@v4 + with: + name: my-artifact-${{ github.run_id }}-py310 + - name: Copy Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + - name: Copy Artifacts - offline + if: ${{inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + - name: Mark as start + run: | + chmod -R 777 ${{env.TEST_CODE_PATH}} + mkdir ${{env.REPORT_DIR}} -p + echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt + + test_evaluation: + needs: download_pkgs + if: ${{ !cancelled() }} + runs-on: [self-hosted, linux-a100] + timeout-minutes: 2400 + strategy: + fail-fast: false + matrix: + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + gpu_num: ['gpu_num_1', 'gpu_num_2', 'gpu_num_4', 'gpu_num_8'] + container: + image: openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/resources:/root/resources + - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme/huggingface_hub:/nvme/huggingface_hub + - /mnt/121:/mnt/121 + - /mnt/104:/mnt/104 + - /mnt/bigdisk:/mnt/bigdisk + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . 
+ mkdir ${{env.REPORT_DIR}} -p + echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt + - name: Install lmdeploy - dependency + run: | + python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt + - name: Install lmdeploy + run: | + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Install vlmeval + run: | + python3 -m pip install pandas datasets scikit-learn pylatexenc math_verify + apt update && apt install -y libgl1 libglib2.0-0 + cp -r /nvme/qa_test_models/offline_pkg/VLMEvalKit . + cd VLMEvalKit && pip install . + - name: Check env + run: | + pip install transformers==4.57.6 + python3 -m pip list + lmdeploy check_env + mkdir ${{env.REPORT_DIR}} -p + echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt + - name: Setup paths for evaluation + if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') + run: | + unset HTTP_PROXY;unset HTTPS_PROXY;unset http_proxy;unset https_proxy; + cd VLMEvalKit && cp -r ../autotest . + execution_mode="${{ github.event.inputs.execution_mode || 'both' }}" + ulimit -n 65535 + if [ "$execution_mode" = "both" ] || [ "$execution_mode" = "infer" ]; then + pytest autotest/evaluate/test_mllm_api_evaluate.py -m "${{matrix.gpu_num}} and ${{matrix.backend}} and infer" --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + fi + if [ "$execution_mode" = "both" ] || [ "$execution_mode" = "eval" ]; then + pytest autotest/evaluate/test_mllm_api_evaluate.py -m "${{matrix.gpu_num}} and ${{matrix.backend}} and eval" -n 4 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? 
+ fi + exit $overall_exit + - name: Clear workspace + if: always() + run: | + echo "status=done" >> ${{env.REPORT_DIR}}/status.txt + export workdir=$(pwd) + rm -rf $workdir/* diff --git a/.github/workflows/pr_ete_test.yml b/.github/workflows/pr_ete_test.yml index 328e791dc4..8e567800c7 100644 --- a/.github/workflows/pr_ete_test.yml +++ b/.github/workflows/pr_ete_test.yml @@ -69,7 +69,7 @@ jobs: pip install transformers==4.57.3 - name: Test restful server - turbomind Qwen3-32B run: | - CUDA_VISIBLE_DEVICES=5,6 lmdeploy serve api_server /nvme/qa_test_models/Qwen/Qwen3-32B --tp 2 --backend turbomind --logprobs-mode raw_logprobs --allow-terminate-by-client > ${{env.SERVER_LOG}}/turbomind_Qwen3-32B_start_restful.log 2>&1 & + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/Qwen/Qwen3-32B --tp 2 --backend turbomind --logprobs-mode raw_logprobs --allow-terminate-by-client > ${{env.SERVER_LOG}}/turbomind_Qwen3-32B_start_restful.log 2>&1 & echo "restful_pid=$!" for i in $(seq 1 180) do @@ -89,7 +89,7 @@ jobs: exit 1 - name: Test restful server - turbomind InternVL3-38B run: | - CUDA_VISIBLE_DEVICES=5,6 lmdeploy serve api_server /nvme/qa_test_models/OpenGVLab/InternVL3-38B --tp 2 --backend turbomind --logprobs-mode raw_logprobs --allow-terminate-by-client > ${{env.SERVER_LOG}}/turbomind_InternVL3-38B_start_restful.log 2>&1 & + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/OpenGVLab/InternVL3-38B --tp 2 --backend turbomind --logprobs-mode raw_logprobs --allow-terminate-by-client > ${{env.SERVER_LOG}}/turbomind_InternVL3-38B_start_restful.log 2>&1 & echo "restful_pid=$!" 
for i in $(seq 1 180) do @@ -109,7 +109,7 @@ jobs: exit 1 - name: Test restful server - turbomind Qwen3-30B-A3B run: | - CUDA_VISIBLE_DEVICES=5,6 lmdeploy serve api_server /nvme/qa_test_models/Qwen/Qwen3-30B-A3B --tp 2 --backend turbomind --logprobs-mode raw_logprobs --allow-terminate-by-client> ${{env.SERVER_LOG}}/turbomind_Qwen3-30B-A3B_start_restful.log 2>&1 & + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/Qwen/Qwen3-30B-A3B --tp 2 --backend turbomind --logprobs-mode raw_logprobs --allow-terminate-by-client> ${{env.SERVER_LOG}}/turbomind_Qwen3-30B-A3B_start_restful.log 2>&1 & echo "restful_pid=$!" for i in $(seq 1 180) do @@ -129,7 +129,7 @@ jobs: exit 1 - name: Test restful server - pytorch Qwen3-30B-A3B run: | - CUDA_VISIBLE_DEVICES=5,6 lmdeploy serve api_server /nvme/qa_test_models/Qwen/Qwen3-30B-A3B --tp 2 --backend pytorch --logprobs-mode raw_logprobs --enable-return-routed-experts --allow-terminate-by-client > ${{env.SERVER_LOG}}/pytorch_Qwen3-30B-A3B_start_restful.log 2>&1 & + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/Qwen/Qwen3-30B-A3B --tp 2 --backend pytorch --logprobs-mode raw_logprobs --enable-return-routed-experts --allow-terminate-by-client > ${{env.SERVER_LOG}}/pytorch_Qwen3-30B-A3B_start_restful.log 2>&1 & echo "restful_pid=$!" 
for i in $(seq 1 180) do @@ -149,7 +149,7 @@ jobs: exit 1 - name: Test restful server - pytorch Qwen3-VL-30B-A3B-Instruct run: | - CUDA_VISIBLE_DEVICES=5,6 lmdeploy serve api_server /nvme/qa_test_models/Qwen/Qwen3-VL-30B-A3B-Instruct --tp 2 --backend pytorch --logprobs-mode raw_logprobs --allow-terminate-by-client > ${{env.SERVER_LOG}}/pytorch_Qwen3-VL-30B-A3B-Instruct_start_restful.log 2>&1 & + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/Qwen/Qwen3-VL-30B-A3B-Instruct --tp 2 --backend pytorch --logprobs-mode raw_logprobs --allow-terminate-by-client > ${{env.SERVER_LOG}}/pytorch_Qwen3-VL-30B-A3B-Instruct_start_restful.log 2>&1 & echo "restful_pid=$!" for i in $(seq 1 180) do @@ -169,7 +169,7 @@ jobs: exit 1 - name: Test restful server - pytorch InternVL3_5-30B-A3B run: | - CUDA_VISIBLE_DEVICES=5,6 lmdeploy serve api_server /nvme/qa_test_models/OpenGVLab/InternVL3_5-30B-A3B --tp 2 --backend pytorch --logprobs-mode raw_logprobs --allow-terminate-by-client > ${{env.SERVER_LOG}}/pytorch_InternVL3_5-30B-A3B_start_restful.log 2>&1 & + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/OpenGVLab/InternVL3_5-30B-A3B --tp 2 --backend pytorch --logprobs-mode raw_logprobs --allow-terminate-by-client > ${{env.SERVER_LOG}}/pytorch_InternVL3_5-30B-A3B_start_restful.log 2>&1 & echo "restful_pid=$!" 
for i in $(seq 1 180) do diff --git a/autotest/benchmark/test_apiserver_performance.py b/autotest/benchmark/test_apiserver_performance.py index 9b4947abfb..76cd8d593c 100644 --- a/autotest/benchmark/test_apiserver_performance.py +++ b/autotest/benchmark/test_apiserver_performance.py @@ -119,7 +119,7 @@ def test_pytorch_apiserver_tp16(config, run_config, worker_id): }, 'extra_params': {} }, { - 'model': 'Qwen/Qwen3-VL-32B-Instruct', + 'model': 'Qwen/Qwen3-VL-30B-A3B-Instruct', 'backend': 'pytorch', 'communicator': 'nccl', 'quant_policy': 8, diff --git a/autotest/benchmark/test_prefixcache_performance.py b/autotest/benchmark/test_prefixcache_performance.py index fd8f4156be..05d51aaf75 100644 --- a/autotest/benchmark/test_prefixcache_performance.py +++ b/autotest/benchmark/test_prefixcache_performance.py @@ -101,7 +101,7 @@ def test_pytorch_prefix_tp16(config, run_config, worker_id): }, 'extra_params': {} }, { - 'model': 'Qwen/Qwen3-VL-32B-Instruct', + 'model': 'Qwen/Qwen3-VL-30B-A3B-Instruct', 'backend': 'pytorch', 'communicator': 'nccl', 'quant_policy': 8, diff --git a/autotest/benchmark/test_throughput_performance.py b/autotest/benchmark/test_throughput_performance.py index e5f99c43da..72a98a6c80 100644 --- a/autotest/benchmark/test_throughput_performance.py +++ b/autotest/benchmark/test_throughput_performance.py @@ -102,7 +102,7 @@ def test_pytorch_throughput_tp16(config, run_config, worker_id): }, 'extra_params': {} }, { - 'model': 'Qwen/Qwen3-VL-32B-Instruct', + 'model': 'Qwen/Qwen3-VL-30B-A3B-Instruct', 'backend': 'pytorch', 'communicator': 'nccl', 'quant_policy': 8, diff --git a/autotest/config.yml b/autotest/config.yml index 80060b6d04..b848f900dd 100644 --- a/autotest/config.yml +++ b/autotest/config.yml @@ -14,36 +14,16 @@ config: tp: meta-llama/Llama-4-Scout-17B-16E-Instruct: 4 meta-llama/Meta-Llama-3-1-70B-Instruct: 4 - internlm/Intern-S1: 8 - internlm/internlm2_5-20b-chat: 2 - internlm/internlm2_5-20b: 2 - internlm/internlm2_5-7b-chat-1m: 4 
OpenGVLab/InternVL3-38B: 2 - OpenGVLab/InternVL2_5-26B: 2 - OpenGVLab/InternVL2_5-26B-MPO: 2 - OpenGVLab/InternVL2_5-38B: 4 - OpenGVLab/InternVL2-40B: 4 Qwen/Qwen3-235B-A22B: 8 - Qwen/Qwen3-32B: 2 Qwen/Qwen3-30B-A3B: 2 - Qwen/Qwen3-VL-32B-Instruct: 2 + Qwen/Qwen3-32B: 2 Qwen/Qwen3-VL-30B-A3B-Instruct: 2 Qwen/Qwen3-30B-A3B-Base: 2 - Qwen/Qwen2.5-32B-Instruct: 2 - Qwen/Qwen2.5-72B-Instruct: 4 Qwen/Qwen2.5-VL-32B-Instruct: 2 - deepseek-ai/DeepSeek-V2-Lite-Chat: 2 - deepseek-ai/DeepSeek-R1-Distill-Qwen-32B: 2 - deepseek-ai/deepseek-vl-1.3b-chat: 2 - baichuan-inc/Baichuan2-13B-Chat: 2 mistralai/Mixtral-8x7B-Instruct-v0.1: 2 - liuhaotian/llava-v1.5-13b: 2 - openbmb/MiniCPM-V-2_6: 2 - google/gemma-2-27b-it: 2 - OpenGVLab/InternVL2-Llama3-76B-AWQ: 4 - unsloth/gpt-oss-20b-BF16: 2 - unsloth/gpt-oss-120b-BF16: 4 OpenGVLab/InternVL3_5-30B-A3B: 2 + zai-org/GLM-4.7-Flash: 2 turbomind_chat_model: tp: @@ -53,61 +33,24 @@ turbomind_chat_model: - meta-llama/Meta-Llama-3-1-8B-Instruct-AWQ - meta-llama/Meta-Llama-3-1-70B-Instruct - meta-llama/Meta-Llama-3-8B-Instruct - - meta-llama/Llama-2-7b-chat-hf - - internlm/Intern-S1 - - internlm/Intern-S1-mini - internlm/internlm3-8b-instruct - internlm/internlm3-8b-instruct-awq - - internlm/internlm2_5-7b-chat - - internlm/internlm2_5-20b-chat - - OpenGVLab/InternVL3_5-30B-A3B - - OpenGVLab/InternVL3-2B - OpenGVLab/InternVL3-8B - OpenGVLab/InternVL3-38B - - OpenGVLab/InternVL2_5-26B-MPO - - OpenGVLab/InternVL2_5-1B - - OpenGVLab/InternVL2_5-8B - - OpenGVLab/InternVL2_5-26B - - OpenGVLab/InternVL2_5-38B - - OpenGVLab/InternVL2-2B - - OpenGVLab/InternVL2-40B - - OpenGVLab/Mini-InternVL-Chat-2B-V1-5 - - OpenGVLab/InternVL2-Llama3-76B-AWQ + - OpenGVLab/InternVL3_5-30B-A3B - Qwen/Qwen3-0.6B - Qwen/Qwen3-4B - Qwen/Qwen3-8B - Qwen/Qwen3-32B - Qwen/Qwen3-30B-A3B + - Qwen/Qwen3-30B-A3B-GPTQ-Int4 - Qwen/Qwen3-235B-A22B - - Qwen/Qwen2.5-0.5B-Instruct - - Qwen/Qwen2.5-7B-Instruct - - Qwen/Qwen2.5-32B-Instruct - - Qwen/Qwen2.5-72B-Instruct - - 
Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4 - Qwen/Qwen2.5-VL-7B-Instruct - Qwen/Qwen2.5-VL-32B-Instruct - - Qwen/Qwen2-VL-2B-Instruct - - Qwen/Qwen2-VL-7B-Instruct - Qwen/Qwen1.5-MoE-A2.7B-Chat - - mistralai/Mistral-7B-Instruct-v0.3 - - mistralai/Mistral-Nemo-Instruct-2407 - mistralai/Mixtral-8x7B-Instruct-v0.1 - - lmdeploy/llama2-chat-7b-w4 - - baichuan-inc/Baichuan2-7B-Chat - - 01-ai/Yi-6B-Chat - - liuhaotian/llava-v1.5-13b - - liuhaotian/llava-v1.6-vicuna-7b - - deepseek-ai/DeepSeek-R1-Distill-Llama-8B - - deepseek-ai/DeepSeek-R1-Distill-Qwen-32B - - deepseek-ai/deepseek-vl-1.3b-chat - - deepseek-ai/deepseek-coder-1.3b-instruct - - deepseek-ai/DeepSeek-V2-Lite-Chat - - codellama/CodeLlama-7b-Instruct-hf - THUDM/glm-4-9b-chat - - THUDM/codegeex4-all-9b - - openbmb/MiniCPM-Llama3-V-2_5 - - openbmb/MiniCPM-V-2_6 - # - allenai/Molmo-7B-D-0924 This modeling file requires the following packages that were not found in your environment: tensorflow. Run `pip install tensorflow` + - zai-org/GLM-4.7-Flash pytorch_chat_model: tp: @@ -117,26 +60,10 @@ pytorch_chat_model: - meta-llama/Meta-Llama-3-1-8B-Instruct - meta-llama/Meta-Llama-3-1-70B-Instruct - meta-llama/Meta-Llama-3-8B-Instruct - - meta-llama/Llama-2-7b-chat-hf - - internlm/Intern-S1 - - internlm/Intern-S1-mini - internlm/internlm3-8b-instruct - - internlm/internlm2_5-7b-chat - - internlm/internlm2_5-20b-chat - - OpenGVLab/InternVL3_5-30B-A3B - - OpenGVLab/InternVL3-2B - OpenGVLab/InternVL3-8B - OpenGVLab/InternVL3-38B - - OpenGVLab/InternVL2_5-26B-MPO - - OpenGVLab/InternVL2_5-1B - - OpenGVLab/InternVL2_5-8B - - OpenGVLab/InternVL2_5-26B - - OpenGVLab/InternVL2_5-38B - - OpenGVLab/InternVL2-2B - - OpenGVLab/InternVL2-4B - - OpenGVLab/InternVL2-40B - - OpenGVLab/InternVL2-Llama3-76B-AWQ - # - OpenGVLab/Mono-InternVL-2B 'dict' object has no attribute 'image_size' + - OpenGVLab/InternVL3_5-30B-A3B - Qwen/Qwen3-0.6B - Qwen/Qwen3-4B - Qwen/Qwen3-8B @@ -144,99 +71,32 @@ pytorch_chat_model: - Qwen/Qwen3-30B-A3B - 
Qwen/Qwen3-235B-A22B - Qwen/Qwen3-VL-8B-Instruct - - Qwen/Qwen3-VL-32B-Instruct - Qwen/Qwen3-VL-30B-A3B-Instruct - - Qwen/Qwen2.5-0.5B-Instruct - - Qwen/Qwen2.5-7B-Instruct - - Qwen/Qwen2.5-32B-Instruct - - Qwen/Qwen2.5-72B-Instruct - - Qwen/Qwen1.5-MoE-A2.7B-Chat - - Qwen/Qwen2.5-VL-7B-Instruct - - Qwen/Qwen2.5-VL-32B-Instruct - - Qwen/Qwen2-VL-2B-Instruct - - Qwen/Qwen2-VL-7B-Instruct - - unsloth/gpt-oss-20b-BF16 - - unsloth/gpt-oss-120b-BF16 - - mistralai/Mistral-7B-Instruct-v0.3 - - mistralai/Mixtral-8x7B-Instruct-v0.1 - - google/gemma-3-12b-it - - google/gemma-2-9b-it - - google/gemma-2-27b-it - - google/gemma-7b-it - - baichuan-inc/Baichuan2-7B-Chat - - baichuan-inc/Baichuan2-13B-Chat - - 01-ai/Yi-6B-Chat - - deepseek-ai/DeepSeek-R1-Distill-Llama-8B - - deepseek-ai/DeepSeek-R1-Distill-Qwen-32B - - deepseek-ai/deepseek-moe-16b-chat - - deepseek-ai/deepseek-coder-1.3b-instruct - - deepseek-ai/DeepSeek-V2-Lite-Chat - - THUDM/chatglm2-6b - THUDM/cogvlm2-llama3-chinese-chat-19B - THUDM/glm-4v-9b - THUDM/glm-4-9b-chat - - THUDM/codegeex4-all-9b - - openbmb/MiniCPM-V-2_6 - - microsoft/Phi-4-mini-instruct - - microsoft/Phi-3.5-mini-instruct + - google/gemma-2-9b-it + - google/gemma-2-27b-it + - zai-org/GLM-4.7-Flash - microsoft/Phi-3.5-vision-instruct - - microsoft/Phi-3-mini-4k-instruct - microsoft/Phi-3-vision-128k-instruct turbomind_vl_model: tp: - - internlm/Intern-S1 - - internlm/Intern-S1-mini - - OpenGVLab/InternVL2_5-26B-MPO - - OpenGVLab/Mini-InternVL-Chat-2B-V1-5 - - OpenGVLab/InternVL3_5-30B-A3B - - OpenGVLab/InternVL3-2B - OpenGVLab/InternVL3-8B - OpenGVLab/InternVL3-38B - - OpenGVLab/InternVL2_5-1B - - OpenGVLab/InternVL2_5-8B - - OpenGVLab/InternVL2_5-26B - - OpenGVLab/InternVL2_5-38B - - OpenGVLab/InternVL2-2B - - OpenGVLab/InternVL2-40B - - OpenGVLab/InternVL2-Llama3-76B-AWQ + - OpenGVLab/InternVL3_5-30B-A3B - Qwen/Qwen2.5-VL-7B-Instruct - Qwen/Qwen2.5-VL-32B-Instruct - - Qwen/Qwen2-VL-2B-Instruct - - Qwen/Qwen2-VL-7B-Instruct - - 
liuhaotian/llava-v1.5-13b - - liuhaotian/llava-v1.6-vicuna-7b - - deepseek-ai/deepseek-vl-1.3b-chat - - openbmb/MiniCPM-Llama3-V-2_5 - - openbmb/MiniCPM-V-2_6 pytorch_vl_model: tp: - - internlm/Intern-S1 - - internlm/Intern-S1-mini - - OpenGVLab/InternVL2_5-26B-MPO - - OpenGVLab/Mini-InternVL-Chat-2B-V1-5 - - OpenGVLab/InternVL3_5-30B-A3B - - OpenGVLab/InternVL3-2B - OpenGVLab/InternVL3-8B - - OpenGVLab/InternVL3-38B - - OpenGVLab/InternVL2_5-1B - - OpenGVLab/InternVL2_5-8B - - OpenGVLab/InternVL2_5-26B - - OpenGVLab/InternVL2_5-38B - - OpenGVLab/InternVL2-2B - - OpenGVLab/InternVL2-4B - - OpenGVLab/InternVL2-40B - # - OpenGVLab/Mono-InternVL-2B 'dict' object has no attribute 'image_size' + - OpenGVLab/InternVL3_5-30B-A3B - Qwen/Qwen3-VL-8B-Instruct - - Qwen/Qwen3-VL-32B-Instruct - Qwen/Qwen3-VL-30B-A3B-Instruct - - Qwen/Qwen2-VL-2B-Instruct - - Qwen/Qwen2-VL-7B-Instruct - - Qwen/Qwen2.5-VL-7B-Instruct - - Qwen/Qwen2.5-VL-32B-Instruct - THUDM/cogvlm-chat-hf - # - THUDM/cogvlm2-llama3-chinese-chat-19B # 'HFChatTemplate' object has no attribute 'eoa' + - THUDM/cogvlm2-llama3-chinese-chat-19B - THUDM/glm-4v-9b - microsoft/Phi-3-vision-128k-instruct - microsoft/Phi-3.5-vision-instruct @@ -245,191 +105,104 @@ turbomind_base_model: tp: - Qwen/Qwen3-8B-Base - Qwen/Qwen3-30B-A3B-Base - - internlm/internlm2_5-7b - - internlm/internlm2_5-1_8b - - internlm/internlm2_5-20b - - codellama/CodeLlama-7b-hf pytorch_base_model: tp: - Qwen/Qwen3-8B-Base - Qwen/Qwen3-30B-A3B-Base - - internlm/internlm2_5-7b - - internlm/internlm2_5-1_8b - - internlm/internlm2_5-20b - - bigcode/starcoder2-7b turbomind_quantization: no_awq: - - internlm/Intern-S1 - - internlm/Intern-S1-mini - meta-llama/Meta-Llama-3-1-70B-Instruct - # - internlm/internlm3-8b-instruct ImportError: cannot import name 'LossKwargs' from 'transformers.utils' (/opt/py3/lib/python3.10/site-packages/transformers/utils/__init__.py) + - internlm/internlm3-8b-instruct # ImportError: cannot import name 'LossKwargs' from 
'transformers.utils' (/opt/py3/lib/python3.10/site-packages/transformers/utils/__init__.py) - Qwen/Qwen3-30B-A3B - Qwen/Qwen3-235B-A22B - Qwen/Qwen3-30B-A3B-Base - - Qwen/Qwen3-VL-8B-Instruct - - Qwen/Qwen3-VL-32B-Instruct - - Qwen/Qwen3-VL-30B-A3B-Instruct - Qwen/Qwen1.5-MoE-A2.7B-Chat - Qwen/Qwen2.5-VL-7B-Instruct - Qwen/Qwen2.5-VL-32B-Instruct - - Qwen/Qwen2-VL-2B-Instruct - - Qwen/Qwen2-VL-7B-Instruct - OpenGVLab/InternVL3_5-30B-A3B - - mistralai/Mistral-7B-Instruct-v0.3 - - mistralai/Mistral-Nemo-Instruct-2407 - - deepseek-ai/deepseek-coder-1.3b-instruct - - deepseek-ai/DeepSeek-V2-Lite-Chat - - codellama/CodeLlama-7b-Instruct-hf - # - allenai/Molmo-7B-D-0924 This modeling file requires the following packages that were not found in your environment: tensorflow. Run `pip install tensorflow` - - THUDM/codegeex4-all-9b + - zai-org/GLM-4.7-Flash gptq: - - internlm/internlm2_5-7b-chat + - empty no_kvint4: - meta-llama/Llama-3.2-1B-Instruct - - internlm/internlm2_5-1_8b - OpenGVLab/InternVL3-2B - OpenGVLab/InternVL3-8B - - OpenGVLab/InternVL2_5-1B - - openbmb/MiniCPM-V-2_6 - - Qwen/Qwen3-8B-Base - Qwen/Qwen3-0.6B - Qwen/Qwen3-4B - Qwen/Qwen3-8B - Qwen/Qwen3-32B - Qwen/Qwen3-30B-A3B + - Qwen/Qwen3-30B-A3B-GPTQ-Int4 - Qwen/Qwen3-235B-A22B - - Qwen/Qwen3-30B-A3B-Base - - Qwen/Qwen3-VL-8B-Instruct - - Qwen/Qwen3-VL-32B-Instruct - - Qwen/Qwen3-VL-30B-A3B-Instruct - - Qwen/Qwen2.5-0.5B-Instruct - - Qwen/Qwen2.5-7B-Instruct - - Qwen/Qwen2.5-32B-Instruct - - Qwen/Qwen2.5-72B-Instruct - Qwen/Qwen2.5-VL-7B-Instruct - Qwen/Qwen2.5-VL-32B-Instruct - - Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4 - - Qwen/Qwen2-VL-2B-Instruct - - Qwen/Qwen2-VL-7B-Instruct - Qwen/Qwen1.5-MoE-A2.7B-Chat - - microsoft/Phi-3.5-mini-instruct - # - allenai/Molmo-7B-D-0924 This modeling file requires the following packages that were not found in your environment: tensorflow. 
Run `pip install tensorflow` - - deepseek-ai/DeepSeek-V2-Lite-Chat + - Qwen/Qwen3-8B-Base + - Qwen/Qwen3-30B-A3B-Base + - zai-org/GLM-4.7-Flash no_kvint8: - deepseek-ai/DeepSeek-V2-Chat - - Qwen/Qwen2.5-7B-Instruct + - zai-org/GLM-4.7-Flash pytorch_quantization: awq: + - meta-llama/Llama-3.2-3B-Instruct - meta-llama/Meta-Llama-3-8B-Instruct - meta-llama/Meta-Llama-3-1-8B-Instruct - - meta-llama/Llama-2-7b-chat-hf # - internlm/internlm3-8b-instruct ImportError: cannot import name 'LossKwargs' from 'transformers.utils' (/opt/py3/lib/python3.10/site-packages/transformers/utils/__init__.py) - - internlm/internlm2_5-7b-chat - - internlm/internlm2_5-20b-chat - - 01-ai/Yi-6B-Chat - Qwen/Qwen3-0.6B - Qwen/Qwen3-4B - Qwen/Qwen3-8B - - Qwen/Qwen3-32B - - Qwen/Qwen2.5-7B-Instruct - microsoft/Phi-3-mini-4k-instruct - # - microsoft/Phi-4-mini-instruct The size of tensor a (5120) must match the size of tensor b (3072) at non-singleton dimension 0 - THUDM/glm-4v-9b w8a8: - - meta-llama/Meta-Llama-3-8B-Instruct - meta-llama/Llama-3.2-1B-Instruct - - meta-llama/Llama-2-7b-chat-hf + - meta-llama/Meta-Llama-3-8B-Instruct + - meta-llama/Meta-Llama-3-1-8B-Instruct # - internlm/internlm3-8b-instruct ImportError: cannot import name 'LossKwargs' from 'transformers.utils' (/opt/py3/lib/python3.10/site-packages/transformers/utils/__init__.py) - - internlm/internlm2_5-7b-chat - - internlm/internlm2_5-20b-chat - - 01-ai/Yi-6B-Chat - - mistralai/Mistral-7B-Instruct-v0.3 - - Qwen/Qwen2.5-7B-Instruct - microsoft/Phi-3-mini-4k-instruct - # - microsoft/Phi-4-mini-instruct The size of tensor a (5120) must match the size of tensor b (3072) at non-singleton dimension 0 - - internlm/internlm2_5-20b - - internlm/internlm2_5-7b - - meta-llama/Meta-Llama-3-1-8B-Instruct no_kvint4: - meta-llama/Llama-3.2-1B-Instruct - - internlm/internlm2_5-1_8b - OpenGVLab/InternVL3-2B - OpenGVLab/InternVL3-8B - - OpenGVLab/InternVL2-4B - - OpenGVLab/InternVL2_5-1B - Qwen/Qwen3-8B-Base + - Qwen/Qwen3-30B-A3B-Base - 
Qwen/Qwen3-0.6B - Qwen/Qwen3-4B - Qwen/Qwen3-8B - Qwen/Qwen3-32B - Qwen/Qwen3-30B-A3B - Qwen/Qwen3-235B-A22B - - Qwen/Qwen3-30B-A3B-Base - Qwen/Qwen3-VL-8B-Instruct - - Qwen/Qwen3-VL-32B-Instruct - Qwen/Qwen3-VL-30B-A3B-Instruct - - Qwen/Qwen2.5-0.5B-Instruct - - Qwen/Qwen2.5-7B-Instruct - - Qwen/Qwen2.5-32B-Instruct - - Qwen/Qwen2.5-72B-Instruct - - Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4 - - Qwen/Qwen1.5-MoE-A2.7B-Chat - - Qwen/Qwen2.5-VL-7B-Instruct - - Qwen/Qwen2.5-VL-32B-Instruct - - Qwen/Qwen2-VL-2B-Instruct - - Qwen/Qwen2-VL-7B-Instruct - - deepseek-ai/DeepSeek-V2-Lite-Chat - - microsoft/Phi-3-mini-4k-instruct - microsoft/Phi-3-vision-128k-instruct - microsoft/Phi-3.5-vision-instruct - - microsoft/Phi-3.5-mini-instruct - - openbmb/MiniCPM-V-2_6 - - unsloth/gpt-oss-20b-BF16 - - unsloth/gpt-oss-120b-BF16 + - zai-org/GLM-4.7-Flash no_kvint8: - - deepseek-ai/DeepSeek-V2-Lite-Chat + - zai-org/GLM-4.7-Flash longtext_benchmark_model: - Qwen/Qwen3-8B - - Qwen/Qwen3-32B - - Qwen/Qwen3-30B-A3B - - Qwen/Qwen3-235B-A22B - -benchmark_model: - - internlm/Intern-S1 - - internlm/Intern-S1-mini - - meta-llama/Llama-2-7b-chat-hf - - meta-llama/Meta-Llama-3-1-8B-Instruct - - meta-llama/Meta-Llama-3-1-70B-Instruct - - internlm/internlm3-8b-instruct - - internlm/internlm2_5-7b-chat - - internlm/internlm2_5-20b-chat - - THUDM/glm-4-9b-chat - - Qwen/Qwen3-32B - Qwen/Qwen3-30B-A3B - - Qwen/Qwen3-235B-A22B - - Qwen/Qwen2.5-7B-Instruct - - Qwen/Qwen2.5-72B-Instruct - - unsloth/gpt-oss-20b-BF16 - - unsloth/gpt-oss-120b-BF16 evaluate_model: - google/gemma-2-9b-it - google/gemma-2-27b-it - meta-llama/Meta-Llama-3-1-8B-Instruct - - Qwen/Qwen2.5-7B-Instruct - - Qwen/Qwen2.5-32B-Instruct - Qwen/Qwen1.5-MoE-A2.7B-Chat - Qwen/Qwen3-30B-A3B +benchmark_model: + - meta-llama/Meta-Llama-3-1-8B-Instruct + - meta-llama/Meta-Llama-3-1-70B-Instruct + - internlm/internlm3-8b-instruct + - THUDM/glm-4-9b-chat + - Qwen/Qwen3-30B-A3B + mllm_evaluate_model: - - internlm/Intern-S1-mini - 
OpenGVLab/InternVL3-8B - Qwen/Qwen3-VL-8B-Instruct - - Qwen/Qwen3-VL-32B-Instruct - Qwen/Qwen3-VL-30B-A3B-Instruct - - internlm/Intern-S1 - OpenGVLab/InternVL3_5-30B-A3B diff --git a/autotest/config_3090.yml b/autotest/config_3090.yml index 20823f38e3..a393f95268 100644 --- a/autotest/config_3090.yml +++ b/autotest/config_3090.yml @@ -38,8 +38,6 @@ pytorch_chat_model: - Qwen/Qwen3-1.7B - Qwen/Qwen3-0.6B - Qwen/Qwen2.5-7B-Instruct - - Qwen/Qwen2.5-VL-3B-Instruct - - Qwen/Qwen2.5-VL-7B-Instruct turbomind_vl_model: tp: @@ -54,8 +52,6 @@ pytorch_vl_model: - OpenGVLab/InternVL3-2B-Instruct - OpenGVLab/InternVL3-1B-Instruct - OpenGVLab/InternVL2_5-1B - - Qwen/Qwen2.5-VL-3B-Instruct - - Qwen/Qwen2.5-VL-7B-Instruct turbomind_base_model: tp: @@ -111,7 +107,5 @@ pytorch_quantization: - Qwen/Qwen3-1.7B - Qwen/Qwen3-0.6B - Qwen/Qwen2.5-7B-Instruct - - Qwen/Qwen2.5-VL-3B-Instruct - - Qwen/Qwen2.5-VL-7B-Instruct no_kvint8: - deepseek-ai/DeepSeek-V2-Lite-Chat diff --git a/autotest/config_3090_legacy.yml b/autotest/config_3090_legacy.yml new file mode 100644 index 0000000000..20823f38e3 --- /dev/null +++ b/autotest/config_3090_legacy.yml @@ -0,0 +1,117 @@ +model_path: /nvme/qa_test_models +resource_path: /nvme/qa_test_models/resource +log_path: /nvme/qa_test_models/autotest_log +server_log_path: /nvme/qa_test_models/server_log +eval_path: /nvme/qa_test_models/evaluation_report +mllm_eval_path: /nvme/qa_test_models/mllm_evaluation_report +benchmark_path: /nvme/qa_test_models/benchmark_report +dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json +env_tag: 3090 +device: cuda + +turbomind_chat_model: + tp: + - meta-llama/Llama-3.2-3B-Instruct + - meta-llama/Llama-3.2-1B-Instruct + - internlm/internlm3-8b-instruct + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL3-2B-Instruct + - OpenGVLab/InternVL3-1B-Instruct + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-8B + - Qwen/Qwen3-4B + - Qwen/Qwen3-1.7B + - Qwen/Qwen3-0.6B + - Qwen/Qwen2.5-7B-Instruct + 
+pytorch_chat_model: + tp: + - meta-llama/Llama-3.2-3B-Instruct + - meta-llama/Llama-3.2-1B-Instruct + - internlm/internlm3-8b-instruct + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL3-2B-Instruct + - OpenGVLab/InternVL3-1B-Instruct + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-8B + - Qwen/Qwen3-4B + - Qwen/Qwen3-1.7B + - Qwen/Qwen3-0.6B + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-3B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + +turbomind_vl_model: + tp: + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL3-2B-Instruct + - OpenGVLab/InternVL3-1B-Instruct + - OpenGVLab/InternVL2_5-1B + +pytorch_vl_model: + tp: + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL3-2B-Instruct + - OpenGVLab/InternVL3-1B-Instruct + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen2.5-VL-3B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + +turbomind_base_model: + tp: + - internlm/internlm3-8b-instruct + - Qwen/Qwen3-8B + +pytorch_base_model: + tp: + - internlm/internlm3-8b-instruct + - Qwen/Qwen3-8B + +turbomind_quantization: + no_awq: + - internlm/internlm3-8b-instruct + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL3-2B-Instruct + gptq: + - empty + no_kvint4: + - meta-llama/Llama-3.2-1B-Instruct + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL3-2B-Instruct + - OpenGVLab/InternVL3-1B-Instruct + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-8B + - Qwen/Qwen3-4B + - Qwen/Qwen3-1.7B + - Qwen/Qwen3-0.6B + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-3B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + no_kvint8: + - deepseek-ai/DeepSeek-V2-Chat + +pytorch_quantization: + awq: + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-8B + - Qwen/Qwen3-4B + - Qwen/Qwen3-1.7B + - Qwen/Qwen3-0.6B + - Qwen/Qwen2.5-7B-Instruct + w8a8: + - meta-llama/Llama-3.2-3B-Instruct + no_kvint4: + - OpenGVLab/InternVL3-8B + - meta-llama/Llama-3.2-1B-Instruct + - OpenGVLab/InternVL3-2B-Instruct + - OpenGVLab/InternVL3-1B-Instruct + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-8B + - Qwen/Qwen3-4B + - Qwen/Qwen3-1.7B + - Qwen/Qwen3-0.6B + - 
Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-3B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + no_kvint8: + - deepseek-ai/DeepSeek-V2-Lite-Chat diff --git a/autotest/config_5080.yml b/autotest/config_5080.yml index a0858c021a..9c3c459cba 100644 --- a/autotest/config_5080.yml +++ b/autotest/config_5080.yml @@ -30,7 +30,6 @@ pytorch_chat_model: - Qwen/Qwen3-4B - Qwen/Qwen3-1.7B - Qwen/Qwen3-0.6B - - Qwen/Qwen2.5-VL-3B-Instruct turbomind_vl_model: tp: @@ -43,7 +42,6 @@ pytorch_vl_model: - OpenGVLab/InternVL3-2B-Instruct - OpenGVLab/InternVL3-1B-Instruct - OpenGVLab/InternVL2_5-1B - - Qwen/Qwen2.5-VL-3B-Instruct turbomind_base_model: tp: @@ -87,6 +85,5 @@ pytorch_quantization: - Qwen/Qwen3-4B - Qwen/Qwen3-1.7B - Qwen/Qwen3-0.6B - - Qwen/Qwen2.5-VL-3B-Instruct no_kvint8: - deepseek-ai/DeepSeek-V2-Lite-Chat diff --git a/autotest/config_5080_legacy.yml b/autotest/config_5080_legacy.yml new file mode 100644 index 0000000000..9d700e4240 --- /dev/null +++ b/autotest/config_5080_legacy.yml @@ -0,0 +1,91 @@ +model_path: /nvme/qa_test_models +resource_path: /nvme/qa_test_models/resource +log_path: /nvme/qa_test_models/autotest_log +server_log_path: /nvme/qa_test_models/server_log +eval_path: /nvme/qa_test_models/evaluation_report +mllm_eval_path: /nvme/qa_test_models/mllm_evaluation_report +benchmark_path: /nvme/qa_test_models/benchmark_report +dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json +env_tag: 5080 +device: cuda + +turbomind_chat_model: + tp: + - meta-llama/Llama-3.2-3B-Instruct + - meta-llama/Llama-3.2-1B-Instruct + - OpenGVLab/InternVL3-2B-Instruct + - OpenGVLab/InternVL3-1B-Instruct + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-4B + - Qwen/Qwen3-1.7B + - Qwen/Qwen3-0.6B + +pytorch_chat_model: + tp: + - meta-llama/Llama-3.2-3B-Instruct + - meta-llama/Llama-3.2-1B-Instruct + - OpenGVLab/InternVL3-2B-Instruct + - OpenGVLab/InternVL3-1B-Instruct + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-4B + - Qwen/Qwen3-1.7B + - Qwen/Qwen3-0.6B + 
+turbomind_vl_model: + tp: + - OpenGVLab/InternVL3-2B-Instruct + - OpenGVLab/InternVL3-1B-Instruct + - OpenGVLab/InternVL2_5-1B + +pytorch_vl_model: + tp: + - OpenGVLab/InternVL3-2B-Instruct + - OpenGVLab/InternVL3-1B-Instruct + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen2.5-VL-3B-Instruct + +turbomind_base_model: + tp: + - Qwen/Qwen3-4B + +pytorch_base_model: + tp: + - Qwen/Qwen3-4B + +turbomind_quantization: + no_awq: + - OpenGVLab/InternVL3-2B-Instruct + gptq: + - empty + no_kvint4: + - meta-llama/Llama-3.2-1B-Instruct + - OpenGVLab/InternVL3-2B-Instruct + - OpenGVLab/InternVL3-1B-Instruct + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-4B + - Qwen/Qwen3-1.7B + - Qwen/Qwen3-0.6B + - Qwen/Qwen2.5-VL-3B-Instruct + no_kvint8: + - deepseek-ai/DeepSeek-V2-Chat + +pytorch_quantization: + awq: + - meta-llama/Llama-3.2-3B-Instruct + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-4B + - Qwen/Qwen3-1.7B + - Qwen/Qwen3-0.6B + w8a8: + - meta-llama/Llama-3.2-3B-Instruct + no_kvint4: + - meta-llama/Llama-3.2-1B-Instruct + - OpenGVLab/InternVL3-2B-Instruct + - OpenGVLab/InternVL3-1B-Instruct + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-4B + - Qwen/Qwen3-1.7B + - Qwen/Qwen3-0.6B + - Qwen/Qwen2.5-VL-3B-Instruct + no_kvint8: + - deepseek-ai/DeepSeek-V2-Lite-Chat diff --git a/autotest/config_h.yml b/autotest/config_h.yml index 0bf5710beb..667033f36c 100644 --- a/autotest/config_h.yml +++ b/autotest/config_h.yml @@ -12,6 +12,7 @@ device: cuda config: tp: + Qwen/Qwen3-235B-A22B-FP8: 4 internlm/Intern-S1: 4 Qwen/Qwen3-235B-A22B-Thinking-2507-FP8: 4 Qwen/Qwen3-30B-A3B: 2 @@ -24,6 +25,8 @@ config: JetLM/SDAR-30B-A3B-Sci: 2 moonshotai/Kimi-K2-Instruct-0905: 16 Qwen/Qwen3-235B-A22B-Thinking-2507: 8 + OpenGVLab/InternVL3_5-38B: 2 + Qwen/Qwen3-VL-30B-A3B-Instruct: 2 internlm/Intern-S1-Pro-FP8: 16 dp_ep: @@ -45,8 +48,6 @@ config: turbomind_chat_model: tp: - - internlm/Intern-S1 - - internlm/Intern-S1-mini - Qwen/Qwen3-0.6B-FP8 - Qwen/Qwen3-1.7B-FP8 - Qwen/Qwen3-4B-FP8 @@ -58,6 +59,7 @@ 
turbomind_chat_model: - Qwen/Qwen3-30B-A3B-FP8 - Qwen/Qwen3-32B - Qwen/Qwen3-32B-FP8 + - OpenGVLab/InternVL3_5-38B - openai/gpt-oss-120b - openai/gpt-oss-20b @@ -66,8 +68,6 @@ turbomind_chat_model: pytorch_chat_model: tp: - - internlm/Intern-S1 - - internlm/Intern-S1-mini - Qwen/Qwen3-0.6B-FP8 - Qwen/Qwen3-1.7B-FP8 - Qwen/Qwen3-4B-FP8 @@ -79,6 +79,8 @@ pytorch_chat_model: - Qwen/Qwen3-30B-A3B-FP8 - Qwen/Qwen3-32B - Qwen/Qwen3-32B-FP8 + - Qwen/Qwen3-VL-30B-A3B-Instruct + - OpenGVLab/InternVL3_5-38B - unsloth/gpt-oss-120b-BF16 - unsloth/gpt-oss-20b-BF16 - deepseek/DeepSeek-V3.1 @@ -92,17 +94,16 @@ pytorch_chat_model: turbomind_vl_model: tp: - - internlm/Intern-S1 - - internlm/Intern-S1-mini + - OpenGVLab/InternVL3_5-38B + pytorch_vl_model: tp: - - internlm/Intern-S1 - - internlm/Intern-S1-mini + - OpenGVLab/InternVL3_5-38B + - Qwen/Qwen3-VL-30B-A3B-Instruct turbomind_base_model: tp: - - internlm/Intern-S1-mini - Qwen/Qwen3-4B-FP8 - openai/gpt-oss-20b @@ -113,8 +114,6 @@ pytorch_base_model: turbomind_quantization: no_awq: - - internlm/Intern-S1 - - internlm/Intern-S1-mini - Qwen/Qwen3-0.6B-FP8 - Qwen/Qwen3-1.7B-FP8 - Qwen/Qwen3-4B-FP8 @@ -131,8 +130,6 @@ turbomind_quantization: gptq: - empty no_kvint4: - - internlm/Intern-S1 - - internlm/Intern-S1-mini - Qwen/Qwen3-0.6B-FP8 - Qwen/Qwen3-1.7B-FP8 - Qwen/Qwen3-4B-FP8 @@ -155,8 +152,6 @@ pytorch_quantization: w8a8: - empty no_kvint4: - - internlm/Intern-S1 - - internlm/Intern-S1-mini - Qwen/Qwen3-8B-Base - Qwen/Qwen3-0.6B-FP8 - Qwen/Qwen3-1.7B-FP8 @@ -183,8 +178,6 @@ longtext_model: - Qwen/Qwen3-235B-A22B-Thinking-2507 benchmark_model: - - internlm/Intern-S1 - - internlm/Intern-S1-mini - meta-llama/Meta-Llama-3-1-8B-Instruct - meta-llama/Meta-Llama-3-1-70B-Instruct - Qwen/Qwen3-32B @@ -211,5 +204,5 @@ evaluate_model: - JetLM/SDAR-30B-A3B-Sci mllm_evaluate_model: - - internlm/Intern-S1 - - internlm/Intern-S1-mini + - OpenGVLab/InternVL3_5-38B + - Qwen/Qwen3-VL-30B-A3B-Instruct diff --git a/autotest/config_h_legacy.yml 
b/autotest/config_h_legacy.yml new file mode 100644 index 0000000000..02c9f9fcc6 --- /dev/null +++ b/autotest/config_h_legacy.yml @@ -0,0 +1,72 @@ +model_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/cicd-autotest/eval_resource/model +resource_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/cicd-autotest/eval_resource/resource +log_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/cicd-autotest/eval_resource/log +server_log_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/cicd-autotest/eval_resource/server_log +eval_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/cicd-autotest/eval_resource/evaluation_report +mllm_eval_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/cicd-autotest/eval_resource/mllm_evaluation_report +benchmark_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/cicd-autotest/eval_resource/benchmark_report +dataset_path: /mnt/shared-storage-user/auto-eval-pipeline/datasets/ShareGPT_V3_unfiltered_cleaned_split.json +prefix_dataset_path: /mnt/shared-storage-user/auto-eval-pipeline/datasets/prefix_cache_test.json +env_tag: h +device: cuda + +config: + tp: + internlm/Intern-S1: 4 + +turbomind_chat_model: + tp: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + +pytorch_chat_model: + tp: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + +turbomind_vl_model: + tp: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + +pytorch_vl_model: + tp: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + +turbomind_base_model: + tp: + +pytorch_base_model: + tp: + +turbomind_quantization: + no_awq: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + gptq: + - empty + no_kvint4: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + no_kvint8: + - empty + +pytorch_quantization: + awq: + - empty + w8a8: + - empty + no_kvint4: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + no_kvint8: + - empty + +benchmark_model: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + +mllm_evaluate_model: + - 
internlm/Intern-S1 + - internlm/Intern-S1-mini diff --git a/autotest/config_legacy.yml b/autotest/config_legacy.yml new file mode 100644 index 0000000000..74abfcd690 --- /dev/null +++ b/autotest/config_legacy.yml @@ -0,0 +1,162 @@ +model_path: /nvme/qa_test_models +resource_path: /nvme/qa_test_models/resource +log_path: /nvme/qa_test_models/autotest_log +server_log_path: /nvme/qa_test_models/server_log +eval_path: /nvme/qa_test_models/evaluation_report +mllm_eval_path: /nvme/qa_test_models/mllm_evaluation_report +benchmark_path: /nvme/qa_test_models/benchmark_report +dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json +prefix_dataset_path: /nvme/qa_test_models/datasets/prefix_cache_test.json +env_tag: a100 +device: cuda + +config: + tp: + meta-llama/Llama-4-Scout-17B-16E-Instruct: 4 + meta-llama/Meta-Llama-3-1-70B-Instruct: 4 + internlm/Intern-S1: 8 + OpenGVLab/InternVL3-38B: 2 + OpenGVLab/InternVL2_5-26B: 2 + OpenGVLab/InternVL2_5-26B-MPO: 2 + OpenGVLab/InternVL2_5-38B: 4 + OpenGVLab/InternVL2-40B: 4 + Qwen/Qwen2.5-72B-Instruct: 4 + deepseek-ai/deepseek-vl-1.3b-chat: 2 + baichuan-inc/Baichuan2-13B-Chat: 2 + mistralai/Mixtral-8x7B-Instruct-v0.1: 2 + google/gemma-2-27b-it: 2 + OpenGVLab/InternVL2-Llama3-76B-AWQ: 4 + unsloth/gpt-oss-20b-BF16: 2 + unsloth/gpt-oss-120b-BF16: 4 + OpenGVLab/InternVL3_5-30B-A3B: 2 + +turbomind_chat_model: + tp: + - meta-llama/Llama-2-7b-chat-hf + - internlm/Intern-S1 + - internlm/Intern-S1-mini + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL2_5-8B + - OpenGVLab/Mini-InternVL-Chat-2B-V1-5 + - OpenGVLab/InternVL2-Llama3-76B-AWQ + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4 + - Qwen/Qwen2.5-VL-7B-Instruct + - Qwen/Qwen2-VL-7B-Instruct + - baichuan-inc/Baichuan2-7B-Chat + - liuhaotian/llava-v1.6-vicuna-7b + - codellama/CodeLlama-7b-Instruct-hf + # - allenai/Molmo-7B-D-0924 This modeling file requires the following packages that were not found in your environment: tensorflow. 
Run `pip install tensorflow` + +pytorch_chat_model: + tp: + - meta-llama/Llama-2-7b-chat-hf + - internlm/Intern-S1 + - internlm/Intern-S1-mini + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL2_5-8B + # - OpenGVLab/Mono-InternVL-2B 'dict' object has no attribute 'image_size' + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + - Qwen/Qwen2-VL-7B-Instruct + - unsloth/gpt-oss-20b-BF16 + - mistralai/Mixtral-8x7B-Instruct-v0.1 + - google/gemma-3-12b-it + - google/gemma-2-9b-it + - google/gemma-2-27b-it + - google/gemma-7b-it + - baichuan-inc/Baichuan2-13B-Chat + - deepseek-ai/deepseek-moe-16b-chat + - THUDM/chatglm2-6b + - microsoft/Phi-4-mini-instruct + +turbomind_vl_model: + tp: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL2_5-8B + - OpenGVLab/Mini-InternVL-Chat-2B-V1-5 + - OpenGVLab/InternVL2-Llama3-76B-AWQ + - Qwen/Qwen2.5-VL-7B-Instruct + - Qwen/Qwen2-VL-7B-Instruct + - liuhaotian/llava-v1.6-vicuna-7b + +pytorch_vl_model: + tp: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL2_5-8B + # - OpenGVLab/Mono-InternVL-2B 'dict' object has no attribute 'image_size' + - Qwen/Qwen2-VL-7B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + +turbomind_base_model: + tp: + - codellama/CodeLlama-7b-hf + +pytorch_base_model: + tp: + - bigcode/starcoder2-7b + +turbomind_quantization: + no_awq: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + - OpenGVLab/InternVL3-8B + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4 + - Qwen/Qwen2.5-VL-7B-Instruct + - Qwen/Qwen2-VL-7B-Instruct + - OpenGVLab/InternVL3_5-30B-A3B + - codellama/CodeLlama-7b-Instruct-hf + # - allenai/Molmo-7B-D-0924 This modeling file requires the following packages that were not found in your environment: tensorflow. 
Run `pip install tensorflow` + gptq: + - empty + no_kvint4: + - OpenGVLab/InternVL3-8B + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + - Qwen/Qwen2-VL-7B-Instruct + # - allenai/Molmo-7B-D-0924 This modeling file requires the following packages that were not found in your environment: tensorflow. Run `pip install tensorflow` + no_kvint8: + - Qwen/Qwen2.5-7B-Instruct + +pytorch_quantization: + awq: + - meta-llama/Llama-2-7b-chat-hf + # - internlm/internlm3-8b-instruct ImportError: cannot import name 'LossKwargs' from 'transformers.utils' (/opt/py3/lib/python3.10/site-packages/transformers/utils/__init__.py) + - Qwen/Qwen2.5-7B-Instruct + # - microsoft/Phi-4-mini-instruct The size of tensor a (5120) must match the size of tensor b (3072) at non-singleton dimension 0 + w8a8: + - meta-llama/Llama-2-7b-chat-hf + # - internlm/internlm3-8b-instruct ImportError: cannot import name 'LossKwargs' from 'transformers.utils' (/opt/py3/lib/python3.10/site-packages/transformers/utils/__init__.py) + - Qwen/Qwen2.5-7B-Instruct + # - microsoft/Phi-4-mini-instruct The size of tensor a (5120) must match the size of tensor b (3072) at non-singleton dimension 0 + no_kvint4: + - OpenGVLab/InternVL3-8B + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + - Qwen/Qwen2-VL-7B-Instruct + - microsoft/Phi-3-vision-128k-instruct + - microsoft/Phi-3.5-vision-instruct + - unsloth/gpt-oss-20b-BF16 + no_kvint8: + - empty + +longtext_benchmark_model: + - internlm/Intern-S1-mini + +benchmark_model: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + - meta-llama/Llama-2-7b-chat-hf + - unsloth/gpt-oss-20b-BF16 + +evaluate_model: + - Qwen/Qwen2.5-7B-Instruct + +mllm_evaluate_model: + - internlm/Intern-S1-mini + - internlm/Intern-S1 diff --git a/autotest/interface/pipeline/test_pipeline_func.py b/autotest/interface/pipeline/test_pipeline_func.py index 7ea918415d..2b544634ce 100644 --- a/autotest/interface/pipeline/test_pipeline_func.py +++ 
b/autotest/interface/pipeline/test_pipeline_func.py @@ -353,7 +353,7 @@ def run_pipeline_testcase_ignore_eos(config, model, backend, file_name): pipe.close() -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_return_with_prompt(config, model, backend, worker_id): @@ -362,7 +362,7 @@ def test_return_with_prompt(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_return_with_prompt_stream(config, model, backend, worker_id): @@ -371,7 +371,7 @@ def test_return_with_prompt_stream(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_return_with_multi_prompt(config, model, backend, worker_id): @@ -380,7 +380,7 @@ def test_return_with_multi_prompt(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_return_with_multi_prompt_stream(config, model, backend, worker_id): @@ -389,7 +389,7 @@ def 
test_return_with_multi_prompt_stream(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_return_with_message(config, model, backend, worker_id): file_name = f'pipeline_log_{worker_id}.txt' @@ -397,7 +397,7 @@ def test_return_with_message(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_return_with_message_stream(config, model, backend, worker_id): file_name = f'pipeline_log_{worker_id}.txt' @@ -405,7 +405,7 @@ def test_return_with_message_stream(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_return_with_message_batch(config, model, backend, worker_id): file_name = f'pipeline_log_{worker_id}.txt' @@ -413,7 +413,7 @@ def test_return_with_message_batch(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def 
test_return_with_message_batch_stream(config, model, backend, worker_id): file_name = f'pipeline_log_{worker_id}.txt' @@ -421,7 +421,7 @@ def test_return_with_message_batch_stream(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig]) def test_return_check_logprobs(config, model, backend, worker_id): file_name = f'pipeline_log_{worker_id}.txt' @@ -429,7 +429,7 @@ def test_return_check_logprobs(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig]) def test_return_check_logprobs_stream(config, model, backend, worker_id): file_name = f'pipeline_log_{worker_id}.txt' @@ -437,7 +437,7 @@ def test_return_check_logprobs_stream(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_backend_config_session_len(config, model, backend, worker_id): file_name = f'pipeline_log_{worker_id}.txt' @@ -445,7 +445,7 @@ def test_backend_config_session_len(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) 
@pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_min_new_tokens(config, model, backend, worker_id): file_name = f'pipeline_log_min_new_tokens_{worker_id}.txt' @@ -453,7 +453,7 @@ def test_gen_config_min_new_tokens(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_stop_words(config, model, backend, worker_id): file_name = f'pipeline_log_stop_words_{worker_id}.txt' @@ -461,7 +461,7 @@ def test_gen_config_stop_words(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_bad_words(config, model, backend, worker_id): file_name = f'pipeline_log_bad_words_{worker_id}.txt' @@ -469,7 +469,7 @@ def test_gen_config_bad_words(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_special_words_false(config, model, backend, worker_id): file_name = f'pipeline_log_special_words_{worker_id}.txt' @@ -477,7 +477,7 @@ def test_gen_config_special_words_false(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) 
+@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_special_words_true(config, model, backend, worker_id): file_name = f'pipeline_log_special_words_{worker_id}.txt' @@ -485,7 +485,7 @@ def test_gen_config_special_words_true(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_minimum_repetition_penalty(config, model, backend, worker_id): file_name = f'pipeline_log_repetition_penalty_{worker_id}.txt' @@ -493,7 +493,7 @@ def test_gen_config_minimum_repetition_penalty(config, model, backend, worker_id assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_repetition_penalty_bigger_than_1(config, model, backend, worker_id): file_name = f'pipeline_log_repetition_penalty_{worker_id}.txt' @@ -501,7 +501,7 @@ def test_gen_config_repetition_penalty_bigger_than_1(config, model, backend, wor assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_minimun_topp(config, model, backend, worker_id): file_name = f'pipeline_log_{worker_id}.txt' @@ -509,7 +509,7 @@ def test_gen_config_minimun_topp(config, model, backend, 
worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_minimun_topk(config, model, backend, worker_id): file_name = f'pipeline_log_{worker_id}.txt' @@ -517,7 +517,7 @@ def test_gen_config_minimun_topk(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_diff_random_seed(config, model, backend, worker_id): file_name = f'pipeline_log_{worker_id}.txt' @@ -525,7 +525,7 @@ def test_gen_config_diff_random_seed(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_same_random_seed(config, model, backend, worker_id): file_name = f'pipeline_log_{worker_id}.txt' @@ -533,7 +533,7 @@ def test_gen_config_same_random_seed(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_do_sample_batch(config, model, backend, worker_id): file_name = 
f'pipeline_log_{worker_id}.txt' @@ -541,7 +541,7 @@ def test_gen_config_do_sample_batch(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_max_new_tokens(config, model, backend, worker_id): file_name = f'pipeline_log_max_new_tokens_{worker_id}.txt' @@ -549,7 +549,7 @@ def test_gen_config_max_new_tokens(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_ignore_eos(config, model, backend, worker_id): file_name = f'pipeline_log_ignore_eos_{worker_id}.txt' @@ -557,7 +557,7 @@ def test_gen_config_ignore_eos(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_backend_config_input_validation(config, model, backend, worker_id): if 'gw' in worker_id: @@ -594,7 +594,7 @@ def test_backend_config_input_validation(config, model, backend, worker_id): unset_device_env_variable() -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig]) def 
test_backend_config_validate_turbomind(config, model, backend, worker_id): if 'gw' in worker_id: @@ -632,7 +632,7 @@ def test_backend_config_validate_turbomind(config, model, backend, worker_id): unset_device_env_variable() -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [PytorchEngineConfig]) def test_backend_config_validate_pytorch(config, model, backend, worker_id): if 'gw' in worker_id: @@ -662,7 +662,7 @@ def test_backend_config_validate_pytorch(config, model, backend, worker_id): unset_device_env_variable() -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig]) def test_backend_config_tp(config, model, backend, worker_id): with pytest.raises(AssertionError): diff --git a/autotest/interface/pipeline/test_pipeline_longtext_func.py b/autotest/interface/pipeline/test_pipeline_longtext_func.py index a87c036814..45850ae1e9 100644 --- a/autotest/interface/pipeline/test_pipeline_longtext_func.py +++ b/autotest/interface/pipeline/test_pipeline_longtext_func.py @@ -13,14 +13,10 @@ SESSION_LEN_CONFIG = { 'Qwen/Qwen2.5-7B-Instruct': SESSION_LEN_32K, - 'Qwen/Qwen2.5-32B-Instruct': SESSION_LEN_32K, - 'Qwen/Qwen2.5-72B-Instruct': SESSION_LEN_32K, 'Qwen/Qwen3-235B-A22B': SESSION_LEN_128K, 'Qwen/Qwen3-30B-A3B': SESSION_LEN_128K, 'Qwen/Qwen3-32B': SESSION_LEN_128K, 'meta-llama/Meta-Llama-3-1-8B-Instruct': SESSION_LEN_128K, - 'internlm/Intern-S1-mini': SESSION_LEN_128K, - 'internlm/Intern-S1': SESSION_LEN_128K, 'meta-llama/Meta-Llama-3-1-70B-Instruct': SESSION_LEN_128K, } @@ -33,8 +29,7 @@ def run_case_in_spawn(target, args): @pytest.mark.gpu_num_1 -@pytest.mark.parametrize( - 'model', ['internlm/Intern-S1-mini', 'internlm/internlm2_5-7b-chat', 
'internlm/internlm2_5-7b-chat-inner-4bits']) +@pytest.mark.parametrize('model', ['Qwen/Qwen3-8B']) def test_history_issue_tp1(config, model, worker_id): if 'gw' in worker_id: set_device_env_variable(worker_id) @@ -77,10 +72,7 @@ def stream_infer_worker(config, model, tp_num): @pytest.mark.gpu_num_1 -@pytest.mark.parametrize('model', [ - 'internlm/Intern-S1-mini', 'internlm/internlm2_5-7b-chat', 'internlm/internlm2_5-7b-chat-inner-4bits', - 'Qwen/Qwen2.5-7B-Instruct', 'meta-llama/Meta-Llama-3-1-8B-Instruct' -]) +@pytest.mark.parametrize('model', ['Qwen/Qwen2.5-7B-Instruct', 'meta-llama/Meta-Llama-3-1-8B-Instruct']) @pytest.mark.parametrize('backend', ['turbomind', 'pytorch']) def test_long_test_passkey_tp1(config, model, backend, worker_id): log_name = ''.join(['pipeline_longtext_passkey_', worker_id, '.log']) @@ -93,7 +85,7 @@ def test_long_test_passkey_tp1(config, model, backend, worker_id): @pytest.mark.gpu_num_2 -@pytest.mark.parametrize('model', ['Qwen/Qwen2.5-32B-Instruct', 'Qwen/Qwen3-30B-A3B', 'Qwen/Qwen3-32B']) +@pytest.mark.parametrize('model', ['Qwen/Qwen3-30B-A3B', 'Qwen/Qwen3-32B']) @pytest.mark.parametrize('backend', ['turbomind', 'pytorch']) def test_long_test_passkey_tp2(config, model, backend, worker_id): log_name = ''.join(['pipeline_longtext_passkey_', worker_id, '.log']) @@ -106,23 +98,8 @@ def test_long_test_passkey_tp2(config, model, backend, worker_id): unset_device_env_variable() -@pytest.mark.gpu_num_4 -@pytest.mark.parametrize('model', ['Qwen/Qwen2.5-72B-Instruct']) -@pytest.mark.parametrize('backend', ['turbomind', 'pytorch']) -def test_long_test_passkey_tp4(config, model, backend, worker_id): - log_name = ''.join(['pipeline_longtext_passkey_', worker_id, '.log']) - if 'gw' in worker_id: - set_device_env_variable(worker_id, parallel_config=4) - os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) - run_case_in_spawn(passkey_retrival_worker, - (config, model, backend, log_name, 4, SESSION_LEN_CONFIG.get(model, 
SESSION_LEN_128K))) - if 'gw' in worker_id: - unset_device_env_variable() - - @pytest.mark.gpu_num_8 -@pytest.mark.parametrize('model', - ['Qwen/Qwen3-235B-A22B', 'internlm/Intern-S1', 'meta-llama/Meta-Llama-3-1-70B-Instruct']) +@pytest.mark.parametrize('model', ['Qwen/Qwen3-235B-A22B', 'meta-llama/Meta-Llama-3-1-70B-Instruct']) @pytest.mark.parametrize('backend', ['turbomind', 'pytorch']) def test_long_test_passkey_tp8(config, model, backend, worker_id): log_name = ''.join(['pipeline_longtext_passkey_', worker_id, '.log']) diff --git a/autotest/interface/restful/test_restful_generate.py b/autotest/interface/restful/test_restful_generate.py index cf4c9a463e..e08b5c3a92 100644 --- a/autotest/interface/restful/test_restful_generate.py +++ b/autotest/interface/restful/test_restful_generate.py @@ -4,7 +4,7 @@ import time from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime -from typing import Any, Dict, List +from typing import Any import pytest import requests @@ -115,8 +115,8 @@ def status_code(self): return resp def _validate_generation_response(self, - data: Dict[str, Any], - expected_fields: List[str] = None, + data: dict[str, Any], + expected_fields: list[str] | None = None, validate_tokens: bool = True, expect_logprobs: bool = False, validate_experts: bool = False) -> None: diff --git a/autotest/tools/common_case_config.py b/autotest/tools/common_case_config.py index 016bbf5e61..12334e8815 100644 --- a/autotest/tools/common_case_config.py +++ b/autotest/tools/common_case_config.py @@ -1,5 +1,5 @@ TURBOMIND_PR_TEST_LLM_GPU2 = [{ - 'model': 'internlm/internlm2_5-20b-chat', + 'model': 'Qwen/Qwen3-30B-A3B', 'backend': 'turbomind', 'communicator': 'cuda-ipc', 'quant_policy': 0, @@ -7,15 +7,6 @@ 'tp': 2 }, 'extra_params': {} -}, { - 'model': 'internlm/internlm2_5-20b-chat-inner-4bits', - 'backend': 'turbomind', - 'communicator': 'nccl', - 'quant_policy': 8, - 'parallel_config': { - 'tp': 2 - }, - 'extra_params': {} }, { 
'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'backend': 'turbomind', @@ -28,7 +19,7 @@ }] TURBOMIND_PR_TEST_LLM_GPU1 = [{ - 'model': 'OpenGVLab/InternVL3-8B', + 'model': 'Qwen/Qwen3-0.6B', 'backend': 'turbomind', 'communicator': 'cuda-ipc', 'quant_policy': 0, @@ -37,7 +28,16 @@ }, 'extra_params': {} }, { - 'model': 'OpenGVLab/InternVL3-8B', + 'model': 'Qwen/Qwen3-0.6B-inner-4bits', + 'backend': 'turbomind', + 'communicator': 'nccl', + 'quant_policy': 0, + 'parallel_config': { + 'tp': 1 + }, + 'extra_params': {} +}, { + 'model': 'Qwen/Qwen3-8B', 'backend': 'turbomind', 'communicator': 'nccl', 'quant_policy': 8, @@ -48,7 +48,7 @@ }] TURBOMIND_PR_TEST_MLLM_GPU1 = [{ - 'model': 'liuhaotian/llava-v1.6-vicuna-7b', + 'model': 'OpenGVLab/InternVL3-8B', 'backend': 'turbomind', 'communicator': 'cuda-ipc', 'quant_policy': 0, @@ -57,7 +57,7 @@ }, 'extra_params': {} }, { - 'model': 'OpenGVLab/InternVL2-4B', + 'model': 'OpenGVLab/InternVL3-8B', 'backend': 'turbomind', 'communicator': 'nccl', 'quant_policy': 8, @@ -65,19 +65,30 @@ 'tp': 1 }, 'extra_params': {} +}] + +TURBOMIND_PR_TEST_MLLM_GPU2 = [{ + 'model': 'OpenGVLab/InternVL3_5-30B-A3B', + 'backend': 'turbomind', + 'communicator': 'cuda-ipc', + 'quant_policy': 0, + 'parallel_config': { + 'tp': 2 + }, + 'extra_params': {} }, { - 'model': 'OpenGVLab/InternVL3-8B', + 'model': 'OpenGVLab/InternVL3_5-30B-A3B', 'backend': 'turbomind', 'communicator': 'nccl', 'quant_policy': 8, 'parallel_config': { - 'tp': 1 + 'tp': 2 }, 'extra_params': {} }] TURBOMIND_FALLBACK_TEST_LLM_GPU1 = [{ - 'model': 'microsoft/Phi-4-mini-instruct', + 'model': 'THUDM/cogvlm-chat-hf', 'backend': 'turbomind', 'communicator': 'cuda-ipc', 'quant_policy': 8, @@ -85,10 +96,19 @@ 'tp': 1 }, 'extra_params': {} +}, { + 'model': 'microsoft/Phi-3.5-vision-instruct', + 'backend': 'turbomind', + 'communicator': 'nccl', + 'quant_policy': 0, + 'parallel_config': { + 'tp': 1 + }, + 'extra_params': {} }] TURBOMIND_FALLBACK_TEST_LLM_GPU2 = [{ - 'model': 
'google/gemma-2-27b-it', + 'model': 'Qwen/Qwen3-VL-30B-A3B-Instruct', 'backend': 'turbomind', 'communicator': 'cuda-ipc', 'quant_policy': 0, @@ -97,7 +117,7 @@ }, 'extra_params': {} }, { - 'model': 'deepseek-ai/deepseek-moe-16b-chat', + 'model': 'Qwen/Qwen3-VL-30B-A3B-Instruct', 'backend': 'turbomind', 'communicator': 'nccl', 'quant_policy': 8, @@ -108,15 +128,6 @@ }] TURBOMIND_FALLBACK_TEST_MLLM_GPU1 = [{ - 'model': 'microsoft/Phi-4-mini-instruct', - 'backend': 'turbomind', - 'communicator': 'cuda-ipc', - 'quant_policy': 8, - 'parallel_config': { - 'tp': 1 - }, - 'extra_params': {} -}, { 'model': 'THUDM/glm-4v-9b', 'backend': 'turbomind', 'communicator': 'cuda-ipc', @@ -126,16 +137,7 @@ }, 'extra_params': {} }, { - 'model': 'THUDM/glm-4v-9b-inner-4bits', - 'backend': 'turbomind', - 'communicator': 'nccl', - 'quant_policy': 0, - 'parallel_config': { - 'tp': 1 - }, - 'extra_params': {} -}, { - 'model': 'OpenGVLab/InternVL2-4B', + 'model': 'THUDM/glm-4v-9b', 'backend': 'turbomind', 'communicator': 'nccl', 'quant_policy': 0, @@ -146,7 +148,7 @@ }] TURBOMIND_LOGPROBS_TEST_LLM_GPU2 = [{ - 'model': 'internlm/internlm2_5-20b-chat', + 'model': 'Qwen/Qwen3-30B-A3B', 'backend': 'turbomind', 'communicator': 'nccl', 'quant_policy': 0, @@ -227,7 +229,7 @@ }] PYTORCH_PR_TEST_LLM_GPU2 = [{ - 'model': 'internlm/internlm2_5-20b-chat', + 'model': 'Qwen/Qwen3-30B-A3B', 'backend': 'pytorch', 'communicator': 'nccl', 'quant_policy': 8, @@ -256,7 +258,7 @@ }, 'extra_params': {} }, { - 'model': 'OpenGVLab/InternVL3-8B', + 'model': 'Qwen/Qwen3-0.6B', 'backend': 'pytorch', 'communicator': 'nccl', 'quant_policy': 8, @@ -267,17 +269,7 @@ }] BASE_TOOLCALL_TEST_LLM = [{ - 'model': 'internlm/internlm2_5-7b-chat', - 'communicator': 'nccl', - 'quant_policy': 0, - 'parallel_config': { - 'tp': 1 - }, - 'extra_params': { - 'tool-call-parser': 'internlm' - } -}, { - 'model': 'Qwen/Qwen2.5-7B-Instruct', + 'model': 'Qwen/Qwen3-8B', 'communicator': 'nccl', 'quant_policy': 0, 'parallel_config': { @@ 
-286,16 +278,6 @@ 'extra_params': { 'tool-call-parser': 'qwen' } -}, { - 'model': 'internlm/internlm2_5-20b-chat', - 'communicator': 'nccl', - 'quant_policy': 0, - 'parallel_config': { - 'tp': 2 - }, - 'extra_params': { - 'tool-call-parser': 'internlm' - } }, { 'model': 'meta-llama/Meta-Llama-3-1-70B-Instruct', 'communicator': 'nccl', @@ -307,11 +289,11 @@ 'tool-call-parser': 'llama3' } }, { - 'model': 'Qwen/Qwen2.5-72B-Instruct', + 'model': 'Qwen/Qwen3-30B-A3B', 'communicator': 'nccl', 'quant_policy': 0, 'parallel_config': { - 'tp': 4 + 'tp': 2 }, 'extra_params': { 'tool-call-parser': 'qwen' @@ -319,24 +301,24 @@ }] BASE_REASONING_TEST_LLM = [{ - 'model': 'deepseek-ai/DeepSeek-R1-Distill-Llama-8B', + 'model': 'Qwen/Qwen3-VL-30B-A3B-Instruct', 'communicator': 'nccl', 'quant_policy': 0, 'parallel_config': { 'tp': 1 }, 'extra_params': { - 'reasoning-parser': 'deepseek-r1' + 'reasoning-parser': 'qwen-qwq' } }, { - 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', + 'model': 'Qwen/Qwen3-30B-A3B', 'communicator': 'nccl', 'quant_policy': 0, 'parallel_config': { 'tp': 2 }, 'extra_params': { - 'reasoning-parser': 'deepseek-r1' + 'reasoning-parser': 'qwen-qwq' } }] diff --git a/autotest/tools/pipeline/llm_case.py b/autotest/tools/pipeline/llm_case.py index 3efe84d9e2..13ce7de514 100644 --- a/autotest/tools/pipeline/llm_case.py +++ b/autotest/tools/pipeline/llm_case.py @@ -63,7 +63,7 @@ def run_pipeline_chat_test(model_path, run_config, cases_path, is_pr_test: bool for case in cases_info.keys(): if is_pr_test and case != 'memory_test': continue - if case != 'code_testcases' and 'code' in model_path.lower(): + if case != 'code_testcase' and 'code' in model_path.lower(): continue case_info = cases_info.get(case) diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py index 4676a34341..bb146b8178 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py +++ 
b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py @@ -1,6 +1,7 @@ import pytest -from tools.common_case_config import TURBOMIND_FALLBACK_TEST_MLLM_GPU1, TURBOMIND_PR_TEST_MLLM_GPU1 -from utils.config_utils import get_func_config_list +from tools.common_case_config import (TURBOMIND_FALLBACK_TEST_MLLM_GPU1, TURBOMIND_PR_TEST_MLLM_GPU1, + TURBOMIND_PR_TEST_MLLM_GPU2) +from utils.config_utils import get_func_config_list, get_workerid from utils.pipeline_chat import run_pipeline_mllm_test BACKEND = 'turbomind' @@ -50,6 +51,17 @@ def test_restful_chat_fallback_backend_tp1(config, run_config, worker_id): @pytest.mark.gpu_num_1 @pytest.mark.other +@pytest.mark.pr_test @pytest.mark.parametrize('run_config', TURBOMIND_PR_TEST_MLLM_GPU1) def test_pipeline_pr_test(config, run_config, worker_id): + worker_id = 'gw' + str(6 + get_workerid(worker_id)) + run_pipeline_mllm_test(config, run_config, worker_id, is_smoke=True) + + +@pytest.mark.gpu_num_2 +@pytest.mark.other +@pytest.mark.pr_test +@pytest.mark.parametrize('run_config', TURBOMIND_PR_TEST_MLLM_GPU2) +def test_pipeline_pr_tp2_test(config, run_config, worker_id): + worker_id = 'gw' + str(3 + get_workerid(worker_id)) run_pipeline_mllm_test(config, run_config, worker_id, is_smoke=True) diff --git a/autotest/tools/quantization/test_quantization_awq.py b/autotest/tools/quantization/test_quantization_awq.py index a92e0d5420..7a6bcb1e52 100644 --- a/autotest/tools/quantization/test_quantization_awq.py +++ b/autotest/tools/quantization/test_quantization_awq.py @@ -30,7 +30,7 @@ def test_quantization_gptq(config, model, worker_id): @pytest.mark.gpu_num_2 @pytest.mark.flaky(reruns=0) @pytest.mark.timeout(900) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) +@pytest.mark.parametrize('model', ['Qwen/Qwen3-0.6B']) def test_quantization_awq_pr(config, model): quantization_type = 'awq' quantization_all(config, model + '-inner-4bits', model, quantization_type, cuda_prefix='CUDA_VISIBLE_DEVICES=6') diff 
--git a/autotest/utils/common_utils.py b/autotest/utils/common_utils.py index 3a7fcd473f..f54c3aa489 100644 --- a/autotest/utils/common_utils.py +++ b/autotest/utils/common_utils.py @@ -1,14 +1,13 @@ import os import subprocess import sys -from typing import Tuple def execute_command_with_logging(cmd, log_file_path: str, timeout: int = 3600, env=None, - should_print=True) -> Tuple[bool, str]: + should_print=True) -> tuple[bool, str]: if env is None: env = os.environ.copy() diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index 362a97ac67..3d71fe1e0d 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -1,7 +1,7 @@ import copy import os from collections import OrderedDict -from typing import Any, Dict, List, Optional +from typing import Any import yaml @@ -12,7 +12,7 @@ SUFFIX_INNER_W8A8 = '-inner-w8a8' -def resolve_extra_params(extra_params: Dict[str, Any], model_base_path: str) -> None: +def resolve_extra_params(extra_params: dict[str, Any], model_base_path: str) -> None: """Resolve relative model paths in extra_params to absolute paths. Centralised helper so that every call-site does not need its own @@ -37,10 +37,10 @@ def resolve_extra_params(extra_params: Dict[str, Any], model_base_path: str) -> def get_func_config_list(backend: str, - parallel_config: Dict[str, int], + parallel_config: dict[str, int], model_type: str = 'chat_model', func_type: str = 'func', - extra: Optional[Dict[str, Any]] = None) -> List[Dict]: + extra: dict[str, Any] | None = None) -> list[dict[str, Any]]: """Generate all valid running config combinations (communicator + quant policy + model). 
@@ -51,7 +51,7 @@ def get_func_config_list(backend: str, func_type: Test func type filter, default: func extra: extra config to update in each run config dict Returns: - List[Dict]: All valid run config dicts + list[dict]: All valid run config dicts """ config = get_config() device = config.get('device', 'cuda') @@ -105,6 +105,10 @@ def get_func_config_list(backend: str, if config.get('env_tag', '') in ['3090', '5080']: run_config['extra_params']['cache-max-entry-count'] = 0.5 + if config.get('env_tag', '') in ['a100'] and ('Qwen3-235B-A22B' in run_config['model'] + or run_config['model'] == 'internlm/Intern-S1'): + run_config['extra_params']['cache-max-entry-count'] = 0.6 + if 'sdar' in run_config['model'].lower(): run_config['extra_params']['dllm-block-length'] = 4 run_config['extra_params']['dllm-denoising-steps'] = 4 @@ -127,7 +131,7 @@ def get_func_config_list(backend: str, return run_configs -def get_cli_common_param(run_config: Dict[str, Any]) -> str: +def get_cli_common_param(run_config: dict[str, Any]) -> str: """Generate cli common params string by run config dict.""" backend = run_config.get('backend') model = run_config.get('model') @@ -162,7 +166,7 @@ def get_cli_common_param(run_config: Dict[str, Any]) -> str: return ' '.join(cli_params).strip() -def get_cli_str(config: Dict[str, Any]) -> str: +def get_cli_str(config: dict[str, Any]) -> str: cli_str = [] # Extra params for key, value in config.items(): @@ -181,7 +185,7 @@ def get_cli_str(config: Dict[str, Any]) -> str: return ' '.join(cli_str) -def get_parallel_config(config: Dict, model_name: str) -> List[Dict[str, int]]: +def get_parallel_config(config: dict[str, Any], model_name: str) -> list[dict[str, int]]: """Get matched parallel config dict by model name, default tp:1 if no match.""" result = [] @@ -201,23 +205,23 @@ def get_parallel_config(config: Dict, model_name: str) -> List[Dict[str, int]]: return result if result else [{'tp': 1}] -def _extract_models_from_config(config_value: Any) -> 
List[str]: +def _extract_models_from_config(config_value: Any) -> list[str]: """Extract flat model name list from config value (dict/list supported)""" models = [] - if isinstance(config_value, Dict): + if isinstance(config_value, dict): for model_list in config_value.values(): - if isinstance(model_list, List): + if isinstance(model_list, list): models.extend([m for m in model_list if isinstance(m, str)]) - elif isinstance(config_value, List): + elif isinstance(config_value, list): models.extend([m for m in config_value if isinstance(m, str)]) return models -def get_model_list(config: Dict, +def get_model_list(config: dict[str, Any], backend: str, - parallel_config: Dict[str, int] = None, + parallel_config: dict[str, int] | None = None, model_type: str = 'chat_model', - func_type: str = 'func') -> List[str]: + func_type: str = 'func') -> list[str]: """Get filtered model list with quantization extended models by backend/parallel config/model type/func type. @@ -228,7 +232,7 @@ def get_model_list(config: Dict, model_type: Model type, default: chat_model func_type: Test func type filter, default: func Returns: - List[str]: Base models + quantization extended models + list[str]: Base models + quantization extended models """ model_config_key = f'{backend}_{model_type}' all_models = [] @@ -252,7 +256,7 @@ def get_model_list(config: Dict, return extended_models -def _filter_by_test_func_type(config: Dict, model_list: List[str], func_type: str) -> List[str]: +def _filter_by_test_func_type(config: dict[str, Any], model_list: list[str], func_type: str) -> list[str]: """Filter model list by test function type, return intersection of two model sets.""" if func_type == 'func': @@ -266,7 +270,8 @@ def _filter_by_test_func_type(config: Dict, model_list: List[str], func_type: st return list(set(filtered_models) & set(model_list)) -def _extend_turbomind_quant_models(quant_config: dict, base_models: list, target_list: list) -> None: +def 
_extend_turbomind_quant_models(quant_config: dict[str, Any], base_models: list[str], + target_list: list[str]) -> None: """Append turbomind quantization models to target list (AWQ 4bits + GPTQ)""" no_awq_models = quant_config.get('no_awq', []) @@ -280,7 +285,7 @@ def _extend_turbomind_quant_models(quant_config: dict, base_models: list, target target_list.append(model_name + SUFFIX_INNER_GPTQ) -def _extend_pytorch_quant_models(quant_config: dict, base_models: list, target_list: list) -> None: +def _extend_pytorch_quant_models(quant_config: dict[str, Any], base_models: list[str], target_list: list[str]) -> None: """Append pytorch quantization models to target list (AWQ 4bits + W8A8)""" # Append AWQ quantization models for model_name in quant_config.get('awq', []): @@ -292,7 +297,7 @@ def _extend_pytorch_quant_models(quant_config: dict, base_models: list, target_l target_list.append(model_name + SUFFIX_INNER_W8A8) -def _is_kvint_model(config: Dict, backend: str, model: str, quant_policy: int) -> bool: +def _is_kvint_model(config: dict[str, Any], backend: str, model: str, quant_policy: int) -> bool: """Check if model supports the kv quantization policy, quant_policy=0 always return True.""" if quant_policy == 0: @@ -308,7 +313,7 @@ def _base_model_name(model: str) -> str: return model.replace('-inner-4bits', '').replace('-inner-w8a8', '').replace('-inner-gptq', '') -def get_quantization_model_list(type: str) -> List[str]: +def get_quantization_model_list(type: str) -> list[str]: """Get quantization model list by specified quant type(awq/gptq/w8a8)""" config = get_config() quant_model_list = [] @@ -340,7 +345,7 @@ def get_quantization_model_list(type: str) -> List[str]: return quant_model_list -def get_config() -> Dict[str, Any]: +def get_config() -> dict[str, Any]: """Load & get yaml config file, auto adapt device env & update log path.""" # Get device env & match config file path env_tag = os.environ.get('TEST_ENV') @@ -370,7 +375,7 @@ def get_config() -> Dict[str, 
Any]: return config_copy -def get_cuda_prefix_by_workerid(worker_id: Optional[str], parallel_config: Dict[str, int] = None) -> Optional[str]: +def get_cuda_prefix_by_workerid(worker_id: str | None, parallel_config: dict[str, int] | None = None) -> str | None: """Get cuda/ascend visible devices env prefix by worker id & parallel config.""" para_conf = parallel_config or {} @@ -387,7 +392,7 @@ def get_cuda_prefix_by_workerid(worker_id: Optional[str], parallel_config: Dict[ return f'ASCEND_RT_VISIBLE_DEVICES={cuda_id}' if device_type == 'ascend' else f'CUDA_VISIBLE_DEVICES={cuda_id}' -def get_cuda_id_by_workerid(worker_id: Optional[str], tp_num: int = 1) -> Optional[str]: +def get_cuda_id_by_workerid(worker_id: str | None, tp_num: int = 1) -> str | None: """Get cuda id str by worker id and tp num, return None if invalid worker id.""" if worker_id is None or 'gw' not in worker_id: @@ -398,7 +403,7 @@ def get_cuda_id_by_workerid(worker_id: Optional[str], tp_num: int = 1) -> Option return ','.join([str(cuda_num + i) for i in range(tp_num)]) -def get_workerid(worker_id: Optional[str]) -> int: +def get_workerid(worker_id: str | None) -> int: """Parse numeric worker id from worker id str, return 0 if invalid worker id.""" if worker_id is None or 'gw' not in worker_id: @@ -413,7 +418,9 @@ def is_quantization_model(model: str) -> bool: return any(key in lower_name for key in ('awq', '4bits', 'w4', 'int4')) -def _get_communicator_list(config: Dict, backend: str, parallel_config: Dict[str, int] = None) -> List[str]: +def _get_communicator_list(config: dict[str, Any], + backend: str, + parallel_config: dict[str, int] | None = None) -> list[str]: """Get available communicator list by device and parallel config.""" device = config.get('device', None) @@ -429,7 +436,7 @@ def _get_communicator_list(config: Dict, backend: str, parallel_config: Dict[str return ['nccl', 'cuda-ipc'] -def set_device_env_variable(worker_id, parallel_config: Dict[str, int] = None): +def 
set_device_env_variable(worker_id: str | None, parallel_config: dict[str, int] | None = None) -> None: """Set device environment variable based on the device type.""" device = os.environ.get('DEVICE', 'cuda') @@ -460,13 +467,13 @@ def unset_device_env_variable(): del os.environ['CUDA_VISIBLE_DEVICES'] -def is_model_in_list(config: Dict, parallel_config: Dict[str, int], model: str) -> bool: +def is_model_in_list(config: dict[str, Any], parallel_config: dict[str, int], model: str) -> bool: """Check if model matches the target parallel config.""" model_config = get_parallel_config(config, model) return parallel_config in model_config -def get_case_str_by_config(run_config: Dict[str, Any], is_simple: bool = True) -> str: +def get_case_str_by_config(run_config: dict[str, Any], is_simple: bool = True) -> str: """Generate case name string by run config dict.""" model_name = run_config['model'] backend_type = run_config['backend'] @@ -491,7 +498,7 @@ def get_case_str_by_config(run_config: Dict[str, Any], is_simple: bool = True) - return f'{backend_type}_{pure_model_name}_{communicator}_{parallel_str}_{quant_policy}{extra_params_case}' -def parse_config_by_case(case_str: str) -> Dict[str, Any]: +def parse_config_by_case(case_str: str) -> dict[str, Any]: """Parse run config dict from case name string (fix split & type convert bug)""" case_parts = case_str.split('_') diff --git a/autotest/utils/constant.py b/autotest/utils/constant.py index 0a3e78a018..153d3220e7 100644 --- a/autotest/utils/constant.py +++ b/autotest/utils/constant.py @@ -153,9 +153,9 @@ RESTFUL_MODEL_LIST = [ 'Qwen/Qwen3-0.6B', 'Qwen/Qwen3-VL-2B-Instruct', 'Qwen/Qwen3-30B-A3B', 'internlm/Intern-S1', - 'internlm/internlm2_5-20b-chat', 'internlm/internlm2_5-20b', 'Qwen/Qwen3-32B', 'OpenGVLab/InternVL3_5-30B-A3B', - 'OpenGVLab/InternVL3-38B', 'Qwen/Qwen3-VL-8B-Instruct', 'internlm/internlm3-8b-instruct', - 'meta-llama/Llama-3.2-3B-Instruct', 'Qwen/Qwen3-VL-30B-A3B-Instruct' + 'internlm/internlm2_5-20b', 
'Qwen/Qwen3-32B', 'OpenGVLab/InternVL3_5-30B-A3B', 'OpenGVLab/InternVL3-38B', + 'Qwen/Qwen3-VL-8B-Instruct', 'internlm/internlm3-8b-instruct', 'meta-llama/Llama-3.2-3B-Instruct', + 'Qwen/Qwen3-VL-30B-A3B-Instruct' ] RESTFUL_BASE_MODEL_LIST = [ diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index f3e6694840..61576c841a 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -50,7 +50,7 @@ def run_pipeline_llm_test(config, run_config, common_case_config, worker_id: str for case in common_case_config.keys(): if is_smoke and case != 'memory_test': continue - if case != 'code_testcases' and 'code' in model_path.lower(): + if case != 'code_testcase' and 'code' in model_path.lower(): continue with allure.step(case): diff --git a/autotest/utils/proxy_distributed_utils.py b/autotest/utils/proxy_distributed_utils.py index dc4efdebad..0472af3953 100644 --- a/autotest/utils/proxy_distributed_utils.py +++ b/autotest/utils/proxy_distributed_utils.py @@ -3,7 +3,7 @@ import socket import subprocess import time -from typing import Any, Dict, Tuple +from typing import Any import requests from utils.config_utils import get_case_str_by_config, get_cli_common_param, resolve_extra_params @@ -28,7 +28,7 @@ def is_port_open(host: str, port: int, timeout: float = 1.0) -> bool: def check_nodes_status(host: str, proxy_port: int, model_name: str, expected_instances: int, check_count: int, current_time: float, last_progress_print: float, - progress_print_interval: int) -> Tuple[bool, int]: + progress_print_interval: int) -> tuple[bool, int]: try: nodes_url = f'http://{host}:{proxy_port}/nodes/status' resp = requests.get(nodes_url, timeout=10) @@ -215,7 +215,7 @@ def cleanup(self): class ApiServerPerTest: - def __init__(self, proxy_manager: ProxyDistributedManager, config: Dict[str, Any], run_config: Dict[str, Any]): + def __init__(self, proxy_manager: ProxyDistributedManager, config: dict[str, Any], run_config: dict[str, Any]): 
self.proxy_manager = proxy_manager self.config = config self.run_config = run_config diff --git a/autotest/utils/ray_distributed_utils.py b/autotest/utils/ray_distributed_utils.py index 2b87a4bb41..919745632a 100644 --- a/autotest/utils/ray_distributed_utils.py +++ b/autotest/utils/ray_distributed_utils.py @@ -4,7 +4,7 @@ import subprocess import time from time import time as time_time -from typing import Any, Dict +from typing import Any import requests from utils.config_utils import get_case_str_by_config, get_cli_common_param, resolve_extra_params @@ -150,7 +150,7 @@ def start_ray_cluster(self): print(f'💥 Ray startup failed: {e.stderr}') raise - def start_lmdeploy_api_server(self, config: dict, run_config: dict): + def start_lmdeploy_api_server(self, config: dict[str, Any], run_config: dict[str, Any]) -> None: """ Master node: Start LMDeploy API Server and wait for it to be ready. Worker nodes: Do not start the service, only verify that the master node's API Server is ready. @@ -252,7 +252,7 @@ def cleanup(self, force: bool = True): print(f'⚠️ Ray stop exception: {e}') self._cleaned = True # Only mark as "fully cleaned" when force=True - def get_cluster_info(self) -> Dict[str, Any]: + def get_cluster_info(self) -> dict[str, Any]: return { 'node_rank': self.node_rank, 'node_count': self.node_count, diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index 13192d37c5..7cb93166a8 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -123,6 +123,8 @@ def run_all_step(log_path, case_name, cases_info, port: int = DEFAULT_PORT): if model is None: assert False, 'server not start correctly' for case in cases_info.keys(): + if case != 'code_testcase' and 'code' in model.lower(): + continue case_info = cases_info.get(case) with allure.step(case + ' restful_test - openai chat'): @@ -153,17 +155,34 @@ def open_chat_test(log_path, case_name, case_info, url): messages.append({'role': 'user', 'content': 
prompt}) file.writelines('prompt:' + prompt + '\n') - response = client.chat.completions.create(model=model_name, - messages=messages, - temperature=0.01, - top_p=0.8, - max_completion_tokens=1024) - - output_content = response.choices[0].message.content - file.writelines('output:' + output_content + '\n') + outputs = client.chat.completions.create(model=model_name, + messages=messages, + temperature=0.01, + top_p=0.8, + max_completion_tokens=1024, + stream=True) + + content_chunks = [] + reasoning_content_chunks = [] + for output in outputs: + # Safely handle streaming chunks: choices may be empty and content may be None + if not getattr(output, 'choices', None): + continue + choice = output.choices[0] + delta = getattr(choice, 'delta', None) + reasoning_content = getattr(delta, 'reasoning_content', None) if delta is not None else None + content = getattr(delta, 'content', None) if delta is not None else None + if reasoning_content: + reasoning_content_chunks.append(reasoning_content) + if content: + content_chunks.append(content) + reasoning_content = ''.join(reasoning_content_chunks) + output_content = ''.join(content_chunks) + + file.writelines(f'reasoning_content :{reasoning_content}, content: {output_content}\n') messages.append({'role': 'assistant', 'content': output_content}) - case_result, reason = assert_result(output_content, prompt_detail.values(), model_name) + case_result, reason = assert_result(reasoning_content + output_content, prompt_detail.values(), model_name) file.writelines('result:' + str(case_result) + ',reason:' + reason + '\n') if not case_result: msg += reason diff --git a/autotest/utils/toolkit.py b/autotest/utils/toolkit.py index 7341c9d044..28078c1336 100644 --- a/autotest/utils/toolkit.py +++ b/autotest/utils/toolkit.py @@ -1,10 +1,9 @@ from functools import lru_cache -from typing import List from transformers import AutoTokenizer -def parse_sse_stream(content: str) -> list: +def parse_sse_stream(content: str) -> list[str]: """Parse 
SSE (Server-Sent Events) stream content into a list of events. Each event is either a JSON string or "[DONE]". @@ -31,7 +30,7 @@ def _load_tokenizer_cached(model_path: str): raise RuntimeError(f"Failed to load tokenizer from '{model_path}': {e}") -def encode_text(model_path: str, text: str) -> List[int]: +def encode_text(model_path: str, text: str) -> list[int]: tokenizer = _load_tokenizer_cached(model_path) encoded = tokenizer.encode(text)