diff --git a/.github/workflows/api_eval.yml b/.github/workflows/api_eval.yml index f8662c699b..ca4ce83467 100644 --- a/.github/workflows/api_eval.yml +++ b/.github/workflows/api_eval.yml @@ -32,7 +32,11 @@ on: description: 'Set custom run ID. If not provided, github.run_id will be used' type: string default: '' - + offline_mode: + required: true + description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' + type: boolean + default: false env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache @@ -40,11 +44,8 @@ env: ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true REPORT_DIR: /nvme/qa_test_models/evaluation_report/allure_report/${{ inputs.repo_ref }}_${{ github.run_id }} COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy - FAIL_CONFIG: '--lf' TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ inputs.repo_ref }}_${{ github.run_id }} OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy - OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt - DEEPSEEK_VL: /nvme/qa_test_models/offline_pkg/DeepSeek-VL COMPASS_DATA_CACHE: /nvme/qa_test_models/compass_data_cache HF_DATASETS_OFFLINE: 1 HF_DATASETS_CACHE: /nvme/qa_test_models/hf_datasets @@ -54,7 +55,7 @@ env: jobs: linux-build: - if: ${{ !cancelled() }} + if: ${{github.event_name == 'schedule' || (!cancelled() && !inputs.offline_mode)}} strategy: matrix: pyver: [py310] diff --git a/.github/workflows/api_eval_h800.yml b/.github/workflows/api_eval_legacy.yml similarity index 57% rename from .github/workflows/api_eval_h800.yml rename to .github/workflows/api_eval_legacy.yml index dc5678927f..e85729378c 100644 --- a/.github/workflows/api_eval_h800.yml +++ b/.github/workflows/api_eval_legacy.yml @@ -1,4 +1,4 @@ -name: api_eval_h800 +name: api_eval_legacy on: workflow_dispatch: @@ -32,31 +32,32 @@ on: description: 'Set custom run ID. 
If not provided, github.run_id will be used' type: string default: '' + offline_mode: + required: true + description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' + type: boolean + default: false env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai - OUTPUT_FOLDER: cuda12.8_dist_${{ github.run_id }} ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true REPORT_DIR: /nvme/qa_test_models/evaluation_report/allure_report/${{ inputs.repo_ref }}_${{ github.run_id }} COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy - FAIL_CONFIG: '--lf' TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ inputs.repo_ref }}_${{ github.run_id }} OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy - OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt - DEEPSEEK_VL: /nvme/qa_test_models/offline_pkg/DeepSeek-VL COMPASS_DATA_CACHE: /nvme/qa_test_models/compass_data_cache HF_DATASETS_OFFLINE: 1 HF_DATASETS_CACHE: /nvme/qa_test_models/hf_datasets HF_HUB_OFFLINE: 1 HF_EVALUATE_OFFLINE: 1 RUN_ID: ${{ inputs.repo_ref }}_${{ github.run_id }} - TEST_ENV: h800 + TEST_ENV: legacy jobs: linux-build: - if: ${{ !cancelled() }} + if: ${{github.event_name == 'schedule' || (!cancelled() && !inputs.offline_mode)}} strategy: matrix: pyver: [py310] @@ -67,8 +68,20 @@ jobs: DOCKER_TAG: cuda12.8 OUTPUT_FOLDER: cuda12.8_dist_${{ github.run_id }} steps: + - name: Free disk space + uses: jlumbroso/free-disk-space@main + with: + # This might remove tools that are actually needed, if set to "true" but frees about 6 GB + tool-cache: false + docker-images: false + # All of these default to true, but feel free to set to "false" if necessary for your workflow + android: true + dotnet: true + haskell: true + large-packages: true + swap-storage: false - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v6 with: repository: ${{ github.event.inputs.repo_org 
|| 'InternLM/lmdeploy' }} ref: ${{github.event.inputs.repo_ref || 'main'}} @@ -90,56 +103,95 @@ jobs: retention-days: 1 name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} - test_evaluation: + + download_pkgs: needs: linux-build - if: ${{ !cancelled() }} - runs-on: [self-hosted, h800-r1] - timeout-minutes: 2400 - strategy: - fail-fast: false - matrix: - backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + if: ${{!cancelled()}} + runs-on: [self-hosted, linux-a100] + timeout-minutes: 50 container: - image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 + image: openmmlab/lmdeploy:latest-cu12.8 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" volumes: - - /nvme/github-actions/pip-cache:/root/.cache/pip - - /nvme/github-actions/packages:/root/packages - - /nvme/github-actions/resources:/root/resources - - /nvme/github-actions/opencompass-data:/root/opencompass-data - /nvme/qa_test_models:/nvme/qa_test_models - - /nvme1/qa_test_models:/nvme1/qa_test_models - - /nvme2/share:/nvme2/share - - /mnt/158_nvme2:/mnt/158_nvme2 - - /mnt/158_nvme3:/mnt/158_nvme3 - - /mnt/158_nvme4:/mnt/158_nvme4 + - /mnt/121:/mnt/121 + - /mnt/104:/mnt/104 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: - - name: Create and change to _wk directory - run: | - echo "Working directory set to: $(pwd)" - name: Clone repository uses: actions/checkout@v2 + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} with: repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Copy repository + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && chmod 777 ${{env.TEST_CODE_PATH}} && cp -r . 
${{env.TEST_CODE_PATH}} + - name: Copy repository - offline + if: ${{inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && chmod 777 ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. ${{env.TEST_CODE_PATH}} - name: Download Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} uses: actions/download-artifact@v4 with: name: my-artifact-${{ github.run_id }}-py310 + - name: Copy Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + - name: Copy Artifacts - offline + if: ${{inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + - name: Mark as start + run: | + chmod -R 777 ${{env.TEST_CODE_PATH}} + mkdir ${{env.REPORT_DIR}} -p + echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt + + test_evaluation: + needs: download_pkgs + if: ${{ !cancelled() }} + runs-on: [self-hosted, linux-a100] + timeout-minutes: 7200 + strategy: + fail-fast: false + matrix: + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + gpu_num: ['gpu_num_1', 'gpu_num_2', 'gpu_num_4', 'gpu_num_8'] + container: + image: openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/resources:/root/resources + - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme/huggingface_hub:/nvme/huggingface_hub + - /mnt/121:/mnt/121 + - /mnt/104:/mnt/104 + - /mnt/bigdisk:/mnt/bigdisk + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . 
+ mkdir ${{env.REPORT_DIR}} -p + echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt - name: Install lmdeploy - dependency run: | - python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt - name: Install lmdeploy run: | - python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt - name: Install opencompass run: | - python3 -m pip install opencompass + git clone https://github.com/open-compass/opencompass.git --depth 1 + cd opencompass + python3 -m pip install . python3 -m pip install langdetect - name: Check env run: | + pip install transformers==4.57.6 python3 -m pip list lmdeploy check_env mkdir ${{env.REPORT_DIR}} -p @@ -148,17 +200,15 @@ jobs: if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') run: | overall_exit=0 - ln -s /nvme/qa_test_models/resource/opencompass-data/data ./data + ln -s /mnt/104/opencompass-data/data ./data ln -s /nvme/qa_test_models/resource/nltk_data /usr/share/nltk_data execution_mode="${{ github.event.inputs.execution_mode || 'both' }}" + ulimit -n 65535 if [ "$execution_mode" = "both" ] || [ "$execution_mode" = "infer" ]; then - pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_1 and ${{matrix.backend}} and infer" -n 8 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? - pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_2 and ${{matrix.backend}} and infer" -n 4 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? - pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_4 and ${{matrix.backend}} and infer" -n 2 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? - pytest autotest/evaluate/test_api_evaluate.py -m "gpu_num_8 and ${{matrix.backend}} and infer" -n 1 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? 
+ pytest autotest/evaluate/test_api_evaluate.py -m "${{matrix.gpu_num}} and ${{matrix.backend}} and infer" --alluredir=${{env.REPORT_DIR}} || overall_exit=$? fi if [ "$execution_mode" = "both" ] || [ "$execution_mode" = "eval" ]; then - pytest autotest/evaluate/test_api_evaluate.py -m "${{matrix.backend}} and eval" -n 4 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + pytest autotest/evaluate/test_api_evaluate.py -m "${{matrix.gpu_num}} and ${{matrix.backend}} and eval" -n 4 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? fi exit $overall_exit - name: Clear workspace diff --git a/.github/workflows/benchmark_legacy.yml b/.github/workflows/benchmark_legacy.yml new file mode 100644 index 0000000000..68d7eafb25 --- /dev/null +++ b/.github/workflows/benchmark_legacy.yml @@ -0,0 +1,204 @@ +name: benchmark_test_legacy + +on: + workflow_dispatch: + inputs: + repo_org: + required: false + description: 'Tested repository organization name. Default is InternLM' + type: string + default: 'InternLM/lmdeploy' + repo_ref: + required: false + description: 'Set branch or tag or commit id. Default is "main"' + type: string + default: 'main' + benchmark_type: + required: true + description: 'Set benchmark type. Default is "["longtext", "throughput", "api_server", "prefixcache"]"' + type: string + default: "['apiserver', 'mllm_apiserver', 'throughput', 'longtext', 'prefixcache']" + backend: + required: true + description: 'Set backend filter. 
Default is "["turbomind", "pytorch"]"' + type: string + default: "['turbomind', 'pytorch']" + offline_mode: + required: true + description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' + type: boolean + default: false + +env: + HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache + HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai + OUTPUT_FOLDER: cuda12.8_dist_${{ github.run_id }} + REPORT_DIR: /nvme/qa_test_models/benchmark_report/${{ inputs.repo_ref }}_${{ github.run_id }} + ALLURE_REPORT_DIR: /nvme/qa_test_models/benchmark_report/allure_report/${{ inputs.repo_ref }}_${{ github.run_id }} + TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ inputs.repo_ref }}_${{ github.run_id }} + OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true + RUN_ID: ${{ inputs.repo_ref }}_${{ github.run_id }} + TEST_ENV: legacy + +jobs: + linux-build: + if: ${{github.event_name == 'schedule' || (!cancelled() && !inputs.offline_mode)}} + strategy: + matrix: + pyver: [py310] + runs-on: ubuntu-latest + env: + PYTHON_VERSION: ${{ matrix.pyver }} + PLAT_NAME: manylinux2014_x86_64 + DOCKER_TAG: cuda12.8 + steps: + - name: Free disk space + uses: jlumbroso/free-disk-space@main + with: + # This might remove tools that are actually needed, if set to "true" but frees about 6 GB + tool-cache: false + docker-images: false + # All of these default to true, but feel free to set to "false" if necessary for your workflow + android: true + dotnet: true + haskell: true + large-packages: true + swap-storage: false + - name: Checkout repository + uses: actions/checkout@v6 + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Build + run: | + echo ${PYTHON_VERSION} + echo ${PLAT_NAME} + echo ${DOCKER_TAG} + echo ${OUTPUT_FOLDER} + echo ${GITHUB_RUN_ID} + # remove -it + sed -i 's/docker run --rm -it/docker run 
--rm/g' builder/manywheel/build_wheel.sh + bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} + - name: Upload Artifacts + uses: actions/upload-artifact@v4 + with: + if-no-files-found: error + path: builder/manywheel/${{ env.OUTPUT_FOLDER }} + retention-days: 1 + name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} + + download_pkgs: + needs: linux-build + if: ${{!cancelled()}} + runs-on: [self-hosted, linux-a100] + timeout-minutes: 50 + container: + image: openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/121:/mnt/121 + - /mnt/104:/mnt/104 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Clone repository + uses: actions/checkout@v2 + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Copy repository + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && chmod 777 ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} + - name: Copy repository - offline + if: ${{inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && chmod 777 ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. 
${{env.TEST_CODE_PATH}} + - name: Download Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + uses: actions/download-artifact@v4 + with: + name: my-artifact-${{ github.run_id }}-py310 + - name: Copy Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + - name: Copy Artifacts - offline + if: ${{inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + - name: Mark as start + run: | + chmod -R 777 ${{env.TEST_CODE_PATH}} + mkdir ${{env.REPORT_DIR}} -p + echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt + + benchmark: + needs: download_pkgs + if: ${{github.event_name == 'schedule' || !cancelled()}} + runs-on: [self-hosted, linux-a100] + strategy: + fail-fast: false + matrix: + benchmark_type: ${{fromJSON(github.event.inputs.benchmark_type)}} + gpu_num: ['gpu_num_1', 'gpu_num_2', 'gpu_num_4', 'gpu_num_8'] + include: + - n: 8 + gpu_num: gpu_num_1 + - n: 4 + gpu_num: gpu_num_2 + - n: 2 + gpu_num: gpu_num_4 + - n: 1 + gpu_num: gpu_num_8 + timeout-minutes: 480 + container: + image: openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme/huggingface_hub:/nvme/huggingface_hub + - /mnt/121:/mnt/121 + - /mnt/104:/mnt/104 + - /mnt/bigdisk:/mnt/bigdisk + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . 
+ mkdir ${{env.REPORT_DIR}} -p + echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt + - name: Install lmdeploy - dependency + run: | + python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt + - name: Install lmdeploy + run: | + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip install transformers==4.57.6 + python3 -m pip list + lmdeploy check_env + - name: Run other benchmark - all + if: contains(fromJson(github.event.inputs.backend), 'turbomind') && contains(fromJson(github.event.inputs.backend), 'pytorch') + run: | + pytest autotest/benchmark/test_${{matrix.benchmark_type}}_performance.py -n ${{matrix.n}} -m '${{matrix.gpu_num}} and not pr_test and not function' --alluredir=${{env.ALLURE_REPORT_DIR}} + - name: Run other benchmark - turbomind + if: contains(fromJson(github.event.inputs.backend), 'turbomind') && !contains(fromJson(github.event.inputs.backend), 'pytorch') + run: | + pytest autotest/benchmark/test_${{matrix.benchmark_type}}_performance.py -n ${{matrix.n}} -m '${{matrix.gpu_num}} and not pr_test and not function and turbomind' --alluredir=${{env.ALLURE_REPORT_DIR}} + - name: Run other benchmark - pytorch + if: contains(fromJson(github.event.inputs.backend), 'pytorch') && !contains(fromJson(github.event.inputs.backend), 'turbomind') + run: | + pytest autotest/benchmark/test_${{matrix.benchmark_type}}_performance.py -n ${{matrix.n}} -m '${{matrix.gpu_num}} and not pr_test and not function and pytorch' --alluredir=${{env.ALLURE_REPORT_DIR}} + - name: Clear workfile + if: always() + run: | + echo "status=done" >> ${{env.REPORT_DIR}}/status.txt + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. 
+ rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index 525e9f4bea..18802033a9 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml @@ -48,7 +48,6 @@ env: ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true REPORT_DIR: /nvme/qa_test_models/test-reports/${{ inputs.repo_ref || 'main' }}_${{ github.run_id }} COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy - FAIL_CONFIG: ${{ github.event_name == 'schedule' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ inputs.repo_ref || 'main' }}_${{ github.run_id }} OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt @@ -183,6 +182,7 @@ jobs: rm -rf ${{env.DEEPSEEK_VL}}/build - name: Check env run: | + pip install transformers==4.57.6 python3 -m pip list lmdeploy check_env rm -rf allure-results @@ -334,28 +334,13 @@ jobs: fail-fast: false matrix: backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} - model_path: ['internlm/Intern-S1', 'internlm/internlm2_5-20b-chat', 'internlm/internlm2_5-20b', 'Qwen/Qwen3-8B-Base', 'Qwen/Qwen3-30B-A3B', 'Qwen/Qwen3-32B', 'OpenGVLab/InternVL3_5-30B-A3B', 'OpenGVLab/InternVL3-38B', 'Qwen/Qwen3-VL-8B-Instruct', 'Qwen/Qwen3-VL-30B-A3B-Instruct'] + model_path: ['Qwen/Qwen3-8B-Base', 'Qwen/Qwen3-30B-A3B', 'Qwen/Qwen3-32B', 'OpenGVLab/InternVL3_5-30B-A3B', 'OpenGVLab/InternVL3-38B', 'Qwen/Qwen3-VL-8B-Instruct', 'Qwen/Qwen3-VL-30B-A3B-Instruct'] include: - - tp: 2 - model: internlm2_5-20b-chat - model_path: internlm/internlm2_5-20b-chat - case_info: ['chat_completions_v1', 'generate'] - generate_type: base - - tp: 2 - model: internlm2_5-20b - model_path: internlm/internlm2_5-20b - case_info: ['completions_v1'] - generate_type: base - tp: 2 model: Qwen3-8B-Base model_path: Qwen/Qwen3-8B-Base 
case_info: ['completions_v1'] generate_type: base - - tp: 8 - model: Intern-S1 - model_path: internlm/Intern-S1 - case_info: ['chat_completions_v1', 'generate'] - generate_type: base - tp: 2 model: Qwen3-30B-A3B model_path: Qwen/Qwen3-30B-A3B @@ -644,7 +629,7 @@ jobs: ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - name: Test benchmark script run: | - pytest autotest/benchmark -n 4 -m function ${{env.FAIL_CONFIG}} --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + pytest autotest/benchmark -n 4 -m function --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - name: Clear workfile if: always() diff --git a/.github/workflows/daily_ete_test_3090.yml b/.github/workflows/daily_ete_test_3090.yml index fd64cfbdd2..c322a9fd20 100644 --- a/.github/workflows/daily_ete_test_3090.yml +++ b/.github/workflows/daily_ete_test_3090.yml @@ -53,7 +53,6 @@ env: OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt RUN_ID: ${{ inputs.repo_ref || 'main' }}_${{ github.run_id }} - TEST_ENV: 3090 jobs: linux-build: @@ -153,6 +152,7 @@ jobs: PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules + TEST_ENV: 3090_legacy container: image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" @@ -178,6 +178,7 @@ jobs: - name: Check env run: | python3 -m pip list + pip install transformers==4.57.6 lmdeploy check_env rm -rf allure-results # remove tmp log in testcase @@ -215,6 +216,7 @@ jobs: fail-fast: false matrix: backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + transformers: ["3090", "3090_legacy"] model: ${{ fromJSON(inputs.model || '["llm", "mllm"]')}} function: ${{ fromJSON(inputs.function || 
'["pipeline","restful","chat"]')}} exclude: @@ -228,6 +230,7 @@ jobs: PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules + TEST_ENV: ${{matrix.transformers}} container: image: openmmlab/lmdeploy:latest-cu12 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" @@ -249,6 +252,10 @@ jobs: run: | python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt + - name: Downgrade transformers + if: ${{matrix.transformers == '3090_legacy'}} + run: | + pip install transformers==4.57.6 - name: Check env run: | python3 -m pip list @@ -294,6 +301,7 @@ jobs: fail-fast: false matrix: backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + transformers: ["3090", "3090_legacy"] model_path: ['internlm/internlm3-8b-instruct', 'Qwen/Qwen3-8B'] include: - tp: 1 @@ -315,6 +323,8 @@ jobs: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/qa_test_models:/nvme/qa_test_models - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + env: + TEST_ENV: ${{matrix.transformers}} steps: - name: Copy repository and Artifacts run: | @@ -328,6 +338,10 @@ jobs: run: | python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt + - name: Downgrade transformers + if: ${{matrix.transformers == '3090_legacy'}} + run: | + pip install transformers==4.57.6 - name: Check env run: | python3 -m pip list diff --git a/.github/workflows/daily_ete_test_5080.yml b/.github/workflows/daily_ete_test_5080.yml index 093463db1e..f32d5f0590 100644 --- a/.github/workflows/daily_ete_test_5080.yml +++ b/.github/workflows/daily_ete_test_5080.yml @@ -53,7 +53,6 @@ env: OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy OFFLINE_REQUIREMENTS: 
/nvme/qa_test_models/offline_pkg/requirements.txt RUN_ID: ${{ inputs.repo_ref || 'main' }}_${{ github.run_id }} - TEST_ENV: 5080 jobs: linux-build: @@ -153,6 +152,7 @@ jobs: PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules + TEST_ENV: 5080_legacy container: image: openmmlab/lmdeploy:latest-cu12.8 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" @@ -177,6 +177,7 @@ jobs: python3 -m pip install -r requirements/test.txt - name: Check env run: | + pip install transformers==4.57.6 for i in $(seq 1 10); do output=$(lmdeploy check_env 2>&1) if echo "$output" | grep -q "CUDA available: False"; then @@ -225,6 +226,7 @@ jobs: matrix: backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} model: ${{ fromJSON(inputs.model || '["llm", "mllm"]')}} + transformers: ["5080", "5080_legacy"] function: ${{ fromJSON(inputs.function || '["pipeline","restful","chat"]')}} exclude: - backend: turbomind @@ -237,6 +239,7 @@ jobs: PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules + TEST_ENV: ${{ matrix.transformers }} container: image: openmmlab/lmdeploy:latest-cu12.8 options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" @@ -258,6 +261,10 @@ jobs: run: | python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt + - name: Downgrade transformers + if: ${{matrix.transformers == '5080_legacy'}} + run: | + pip install transformers==4.57.6 - name: Check env run: | for i in $(seq 1 10); do @@ -313,6 +320,7 @@ jobs: matrix: backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} model_path: 
['meta-llama/Llama-3.2-3B-Instruct', 'Qwen/Qwen3-4B'] + transformers: ["5080", "5080_legacy"] include: - tp: 1 model: Llama-3.2-3B-Instruct @@ -334,6 +342,8 @@ jobs: - /nvme/qa_test_models:/nvme/qa_test_models - /mnt/3090:/mnt/3090 - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + env: + TEST_ENV: ${{ matrix.transformers }} steps: - name: Copy repository and Artifacts run: | @@ -347,6 +357,10 @@ jobs: run: | python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt + - name: Downgrade transformers + if: ${{matrix.transformers == '5080_legacy'}} + run: | + pip install transformers==4.57.6 - name: Check env run: | for i in $(seq 1 10); do diff --git a/.github/workflows/daily_ete_test_h800.yml b/.github/workflows/daily_ete_test_h800.yml deleted file mode 100644 index b61123f6ef..0000000000 --- a/.github/workflows/daily_ete_test_h800.yml +++ /dev/null @@ -1,355 +0,0 @@ -name: daily_ete_test_h800 - -on: - workflow_dispatch: - inputs: - repo_org: - required: false - description: 'Tested repository organization name. Default is InternLM' - type: string - default: 'InternLM/lmdeploy' - repo_ref: - required: false - description: 'Set branch or tag or commit id. Default is "main"' - type: string - default: 'main' - backend: - required: true - description: 'Set backend filter. Default is "["turbomind", "pytorch"]"' - type: string - default: "['turbomind', 'pytorch']" - model: - required: true - description: 'Set testcase module filter: llm, mllm. Default contains all models' - type: string - default: "['llm','mllm']" - function: - required: true - description: 'Set testcase function filter: chat, restful, pipeline. 
Default contains all functions' - type: string - default: '["pipeline", "restful", "chat"]' - offline_mode: - required: true - description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' - type: boolean - default: false - regression_func: - required: true - description: 'regression functions' - type: string - default: "['tools','restful']" - schedule: - - cron: '00 14 * * 0-4' - -env: - HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache - HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai - OUTPUT_FOLDER: cuda12.8_dist_${{ github.run_id }} - ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true - REPORT_DIR: /nvme/qa_test_models/test-reports/${{ inputs.repo_ref || 'main' }}_${{ github.run_id }} - COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy - FAIL_CONFIG: ${{ github.event_name == 'schedule' && github.run_attempt != 1 && '--lf --lfnf none' || '--lf'}} - TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ inputs.repo_ref || 'main' }}_${{ github.run_id }} - OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy - OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt - RUN_ID: ${{ inputs.repo_ref || 'main' }}_${{ github.run_id }} - TEST_ENV: h800 - -jobs: - linux-build: - if: ${{!cancelled() && (github.event_name == 'schedule' || !inputs.offline_mode)}} - strategy: - matrix: - pyver: [py310] - runs-on: ubuntu-latest - env: - PYTHON_VERSION: ${{ matrix.pyver }} - PLAT_NAME: manylinux2014_x86_64 - DOCKER_TAG: cuda12.8 - steps: - - name: Free disk space - uses: jlumbroso/free-disk-space@main - with: - # This might remove tools that are actually needed, if set to "true" but frees about 6 GB - tool-cache: false - docker-images: false - # All of these default to true, but feel free to set to "false" if necessary for your workflow - android: true - dotnet: true - haskell: true - large-packages: true - swap-storage: false - - name: Checkout repository - uses: actions/checkout@v3 - with: - 
repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} - ref: ${{github.event.inputs.repo_ref || 'main'}} - - name: Build - run: | - echo ${PYTHON_VERSION} - echo ${PLAT_NAME} - echo ${DOCKER_TAG} - echo ${OUTPUT_FOLDER} - echo ${GITHUB_RUN_ID} - # remove -it - sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh - bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} - - name: Upload Artifacts - uses: actions/upload-artifact@v4 - with: - if-no-files-found: error - path: builder/manywheel/${{ env.OUTPUT_FOLDER }} - retention-days: 1 - name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} - - - download_pkgs: - needs: linux-build - if: ${{!cancelled()}} - runs-on: [self-hosted, h800-r1] - timeout-minutes: 50 - container: - image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" - volumes: - - /nvme/qa_test_models:/nvme/qa_test_models - - /nvme1/qa_test_models:/nvme1/qa_test_models - - /nvme2/share:/nvme2/share - - /mnt/137_nvme4:/mnt/137_nvme4 - - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro - steps: - - name: Clone repository - uses: actions/checkout@v2 - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - with: - repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} - ref: ${{github.event.inputs.repo_ref || 'main'}} - - name: Copy repository - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} - - name: Copy repository - offline - if: ${{inputs.offline_mode}} - run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. 
${{env.TEST_CODE_PATH}} - - name: Download Artifacts - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - uses: actions/download-artifact@v4 - with: - name: my-artifact-${{ github.run_id }}-py310 - - name: Copy Artifacts - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} - - name: Copy Artifacts - offline - if: ${{inputs.offline_mode}} - run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}} - - name: Mark as start - run: | - mkdir ${{env.REPORT_DIR}} -p - echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt - - test_tools: - if: ${{!cancelled() && !contains(needs.download_pkgs.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} - runs-on: [self-hosted, h800-r1] - needs: download_pkgs - timeout-minutes: 300 - strategy: - fail-fast: false - matrix: - backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} - model: ${{ fromJSON(inputs.model || '["llm", "mllm"]')}} - function: ${{ fromJSON(inputs.function || '["pipeline","restful","chat"]')}} - exclude: - - backend: turbomind - model: mllm - function: chat - - backend: pytorch - model: mllm - function: chat - env: - PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA - MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub - MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules - container: - image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" - volumes: - - /nvme/github-actions/pip-cache:/root/.cache/pip - - /nvme/qa_test_models:/nvme/qa_test_models - - /nvme1/qa_test_models:/nvme1/qa_test_models - - /nvme2/share:/nvme2/share - - /mnt/137_nvme2:/mnt/137_nvme2 - - 
/mnt/137_nvme3:/mnt/137_nvme3 - - /mnt/137_nvme4:/mnt/137_nvme4 - - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro - steps: - - name: Copy repository and Artifacts - run: | - cp -r ${{env.TEST_CODE_PATH}}/. . - - name: Install lmdeploy - dependency - run: | - python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - - name: Install lmdeploy - run: | - python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps - python3 -m pip install -r requirements/test.txt - - name: Check env - run: | - python3 -m pip list - lmdeploy check_env - rm -rf allure-results - # remove tmp log in testcase - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f - ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - - name: Test lmdeploy - chat - continue-on-error: true - if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' - run: | - pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test and not other' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test and not other' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_4 and not pr_test and not other' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_8 and not pr_test and not other' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Test lmdeploy - pipeline - 
continue-on-error: true - if: matrix.function == 'pipeline' - run: | - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test and not other' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test and not other' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_4 and not pr_test and not other' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test and not other' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Test lmdeploy - restful - continue-on-error: true - if: matrix.function == 'restful' - run: | - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test and not other' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test and not other' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_4 and not pr_test and not other' -n 2 --alluredir=${{env.REPORT_DIR}} 
${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test and not other' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Clear workfile - if: always() - run: | - chmod -R 777 $REPORT_DIR - export workdir=$(pwd) - cd .. - rm -rf $workdir - mkdir $workdir - chmod -R 777 $workdir - - test_restful: - if: ${{!cancelled() && !contains(needs.download_pkgs.result, 'fail') && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} - runs-on: [self-hosted, h800-r1] - needs: download_pkgs - strategy: - fail-fast: false - matrix: - backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} - model: ['Intern-S1'] - include: - - tp: 8 - model: Intern-S1 - timeout-minutes: 60 - container: - image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" - volumes: - - /nvme/github-actions/pip-cache:/root/.cache/pip - - /nvme/qa_test_models:/nvme/qa_test_models - - /nvme1/qa_test_models:/nvme1/qa_test_models - - /nvme2/share:/nvme2/share - - /mnt/137_nvme2:/mnt/137_nvme2 - - /mnt/137_nvme3:/mnt/137_nvme3 - - /mnt/137_nvme4:/mnt/137_nvme4 - - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro - steps: - - name: Copy repository and Artifacts - run: | - cp -r ${{env.TEST_CODE_PATH}}/. . 
- - name: Install lmdeploy - dependency - run: | - python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} - - name: Install lmdeploy - run: | - python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps - python3 -m pip install -r requirements/test.txt - - name: Check env - run: | - python3 -m pip list - lmdeploy check_env - rm -rf allure-results - # remove tmp log in testcase - mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f - ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest - - name: Start restful api - if: matrix.model != 'internlm2_5-20b' - run: | - lmdeploy serve api_server /nvme/qa_test_models/internlm/${{matrix.model}} --tp ${{matrix.tp}} --backend ${{matrix.backend}} > ${{env.REPORT_DIR}}/${{matrix.backend}}_${{matrix.model}}_start_chat_restful.log 2>&1 & - echo "restful_pid=$!" >> "$GITHUB_ENV" - sleep 900s - - name: Test lmdeploy - restful api - if: matrix.model == 'Intern-S1' - timeout-minutes: 30 - run: | - pytest autotest/interface/restful/test_restful_chat_func.py -n 20 -m 'not not_${{matrix.backend}} and not internlm2_5' --alluredir=${{env.REPORT_DIR}}/interface-${{matrix.model}}-${{matrix.backend}}_ ${{env.COV_PARAM}} || true - mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') - - name: Kill api server - if: matrix.model != 'internlm2_5-20b' - run: | - kill -15 "$restful_pid" - - name: Clear workfile - if: always() - run: | - chmod -R 777 $REPORT_DIR - chmod -R 777 ${{env.REPORT_DIR}} - export workdir=$(pwd) - cd .. 
- rm -rf $workdir - mkdir $workdir - chmod -R 777 $workdir - - get_coverage_report: - if: ${{!cancelled()}} - runs-on: [self-hosted, h800-r1] - needs: [test_tools, test_restful] - timeout-minutes: 5 - container: - image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" - volumes: - - /nvme/github-actions/pip-cache:/root/.cache/pip - - /nvme/qa_test_models:/nvme/qa_test_models - - /nvme1/qa_test_models:/nvme1/qa_test_models - - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro - steps: - - name: Copy repository and Artifacts - run: cp -r ${{env.TEST_CODE_PATH}}/. . - - name: Install lmdeploy - run: | - echo "status=done" >> ${{env.REPORT_DIR}}/status.txt - python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps - python3 -m pip install -r requirements/test.txt - - name: Get coverage report - run: | - pip install coverage - coverage combine ${{env.REPORT_DIR}} - coverage xml -o ${{env.REPORT_DIR}}/coverage.xml - coverage report -m - mv .coverage ${{env.REPORT_DIR}}/.coverage - - name: Clear workfile - if: always() - run: | - chmod -R 777 $REPORT_DIR - export workdir=$(pwd) - cd .. - rm -rf $workdir - mkdir $workdir - chmod -R 777 $workdir diff --git a/.github/workflows/daily_ete_test_legacy.yml b/.github/workflows/daily_ete_test_legacy.yml new file mode 100644 index 0000000000..bcaefbc244 --- /dev/null +++ b/.github/workflows/daily_ete_test_legacy.yml @@ -0,0 +1,561 @@ +name: daily_ete_test_legacy + +on: + workflow_dispatch: + inputs: + repo_org: + required: false + description: 'Tested repository organization name. Default is InternLM' + type: string + default: 'InternLM/lmdeploy' + repo_ref: + required: false + description: 'Set branch or tag or commit id. Default is "main"' + type: string + default: 'main' + backend: + required: true + description: 'Set backend filter. 
Default is "["turbomind", "pytorch"]"' + type: string + default: "['turbomind', 'pytorch']" + model: + required: true + description: 'Set testcase module filter: llm, mllm. Default contains all models' + type: string + default: "['llm','mllm']" + function: + required: true + description: 'Set testcase function filter: chat, restful, pipeline. Default contains all functions' + type: string + default: '["pipeline", "restful", "chat"]' + offline_mode: + required: true + description: 'Whether to start in offline mode; if true, you should prepare the code and whl package yourself' + type: boolean + default: false + regression_func: + required: true + description: 'regression functions' + type: string + default: "['quant', 'tools','restful','pipeline','evaluation']" + schedule: + - cron: '00 14 * * 0-4' + +env: + HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache + HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai + OUTPUT_FOLDER: cuda12.8_dist_${{ github.run_id }} + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true + REPORT_DIR: /nvme/qa_test_models/test-reports/${{ inputs.repo_ref || 'main' }}_${{ github.run_id }} + COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy + TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ inputs.repo_ref || 'main' }}_${{ github.run_id }} + OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy + OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt + DEEPSEEK_VL: /nvme/qa_test_models/offline_pkg/DeepSeek-VL + RUN_ID: ${{ inputs.repo_ref || 'main' }}_${{ github.run_id }} + TEST_ENV: legacy + +jobs: + linux-build: + if: ${{!cancelled() && (github.event_name == 'schedule' || !inputs.offline_mode)}} + strategy: + matrix: + pyver: [py310] + runs-on: ubuntu-latest + env: + PYTHON_VERSION: ${{ matrix.pyver }} + PLAT_NAME: manylinux2014_x86_64 + DOCKER_TAG: cuda12.8 + steps: + - name: Free disk space + uses: jlumbroso/free-disk-space@main + with: + # This might remove tools that are actually needed, if set to
"true" but frees about 6 GB + tool-cache: false + docker-images: false + # All of these default to true, but feel free to set to "false" if necessary for your workflow + android: true + dotnet: true + haskell: true + large-packages: true + swap-storage: false + - name: Checkout repository + uses: actions/checkout@v6 + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Build + run: | + echo ${PYTHON_VERSION} + echo ${PLAT_NAME} + echo ${DOCKER_TAG} + echo ${OUTPUT_FOLDER} + echo ${GITHUB_RUN_ID} + # remove -it + sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh + bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} + - name: Upload Artifacts + uses: actions/upload-artifact@v4 + with: + if-no-files-found: error + path: builder/manywheel/${{ env.OUTPUT_FOLDER }} + retention-days: 1 + name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} + + + download_pkgs: + needs: linux-build + if: ${{!cancelled()}} + runs-on: [self-hosted, linux-a100] + timeout-minutes: 50 + container: + image: openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/qa_test_models:/nvme/qa_test_models + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Clone repository + uses: actions/checkout@v2 + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Copy repository + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && chmod 777 ${{env.TEST_CODE_PATH}} && cp -r . 
${{env.TEST_CODE_PATH}} + - name: Copy repository - offline + if: ${{inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && chmod 777 ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. ${{env.TEST_CODE_PATH}} + - name: Download Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + uses: actions/download-artifact@v4 + with: + name: my-artifact-${{ github.run_id }}-py310 + - name: Copy Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + - name: Copy Artifacts - offline + if: ${{inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + - name: Mark as start + run: | + chmod -R 777 ${{env.TEST_CODE_PATH}} + mkdir ${{env.REPORT_DIR}} -p + echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt + + test_quantization: + needs: download_pkgs + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'quant') )}} + runs-on: [self-hosted, linux-a100] + timeout-minutes: 150 + env: + PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA + MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub + MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules + container: + image: openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme/huggingface_hub:/nvme/huggingface_hub + - /mnt/121:/mnt/121 + - /mnt/104:/mnt/104 + - /mnt/bigdisk:/mnt/bigdisk + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r 
${{env.TEST_CODE_PATH}}/. . + mkdir ${{env.REPORT_DIR}} -p + echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt + - name: Install lmdeploy - dependency + run: | + python3 -m pip install auto_gptq matplotlib attrdict + python3 -m pip install -r requirements/lite.txt + - name: Install lmdeploy + run: | + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + pip install ${{env.DEEPSEEK_VL}} --no-deps + rm -rf ${{env.DEEPSEEK_VL}}/build + - name: Check env + run: | + pip install transformers==4.57.6 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - quantization w4a16 + continue-on-error: true + if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'turbomind') + run: | + pytest autotest/tools/quantization/test_quantization_awq.py -m 'not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} --clean-alluredir ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - quantization w8a8 + continue-on-error: true + if: github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.backend), 'pytorch') + run: | + pytest autotest/tools/quantization/test_quantization_w8a8.py -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + echo "status=done" >> ${{env.REPORT_DIR}}/status.txt + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. 
+ rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_tools: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'tools'))}} + runs-on: [self-hosted, linux-a100] + needs: test_quantization + timeout-minutes: 300 + strategy: + fail-fast: false + matrix: + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + model: ${{ fromJSON(inputs.model || '["llm", "mllm"]')}} + function: ${{ fromJSON(inputs.function || '["pipeline","restful","chat"]')}} + exclude: + - backend: turbomind + model: mllm + function: chat + - backend: pytorch + model: mllm + function: chat + include: + - backend: turbomind + model: llm + function: other + env: + PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA + MODELSCOPE_CACHE: /nvme/qa_test_models/modelscope_hub + MODELSCOPE_MODULES_CACHE: /nvme/qa_test_models/modelscope_modules + container: + image: openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme/huggingface_hub:/nvme/huggingface_hub + - /mnt/121:/mnt/121 + - /mnt/104:/mnt/104 + - /mnt/bigdisk:/mnt/bigdisk + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . 
+ mkdir ${{env.REPORT_DIR}} -p + echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt + - name: Install lmdeploy - dependency + run: | + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + pip install ${{env.DEEPSEEK_VL}} --no-deps + rm -rf ${{env.DEEPSEEK_VL}}/build + - name: Check env + run: | + pip install transformers==4.57.6 + python3 -m pip list + lmdeploy check_env + cp -r /nvme/qa_test_models/offline_pkg/lora . + rm -rf allure-results + # remove tmp log in testcase + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - chat + continue-on-error: true + if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') && matrix.model == 'llm' && matrix.function == 'chat' + run: | + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/chat/test_command_chat_hf_${{matrix.backend}}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - pipeline + continue-on-error: true + 
if: matrix.function == 'pipeline' + run: | + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/pipeline/test_pipeline_chat_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - restful + continue-on-error: true + if: matrix.function == 'restful' + run: | + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest 
autotest/tools/restful/test_restful_chat_hf_${{matrix.backend}}_${{matrix.model}}.py -m 'gpu_num_8 and not pr_test' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - local testcase + if: matrix.backend == 'turbomind' && matrix.model == 'llm' && matrix.function == 'other' + run: | + pytest autotest/toolchain --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: always() + run: | + echo "status=done" >> ${{env.REPORT_DIR}}/status.txt + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_restful: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'restful'))}} + runs-on: [self-hosted, linux-a100] + needs: test_quantization + strategy: + fail-fast: false + matrix: + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + model_path: ['internlm/Intern-S1'] + include: + - tp: 8 + model: Intern-S1 + model_path: internlm/Intern-S1 + case_info: ['chat_completions_v1', 'generate'] + generate_type: base + timeout-minutes: 60 + container: + image: openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme/huggingface_hub:/nvme/huggingface_hub + - /mnt/121:/mnt/121 + - /mnt/104:/mnt/104 + - /mnt/bigdisk:/mnt/bigdisk + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . 
+ mkdir ${{env.REPORT_DIR}} -p + echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt + - name: Install lmdeploy - dependency + run: | + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Check env + run: | + pip install transformers==4.57.6 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Start restful api + run: | + lmdeploy serve api_server /nvme/qa_test_models/${{matrix.model_path}} --tp ${{matrix.tp}} --backend ${{matrix.backend}} ${{matrix.extra}} --allow-terminate-by-client > ${{env.REPORT_DIR}}/${{matrix.backend}}_${{matrix.model}}_${{matrix.generate_type}}_start_restful.log 2>&1 & + echo "restful_pid=$!" 
+ for i in $(seq 1 240) + do + sleep 5 + echo "health check try $i" + if curl -f -s http://127.0.0.1:23333/health > /dev/null 2>&1; then + echo "health check success" + exit 0 + fi + done + + echo "health check fail" + curl -f -s http://127.0.0.1:23333/terminate > /dev/null 2>&1 + exit 1 + - name: Test lmdeploy - chat_completions_v1 + if: matrix.model != 'internlm2_5-20b-chat' && matrix.model != 'Intern-S1' && contains(matrix.case_info, 'chat_completions_v1') + timeout-minutes: 60 + run: | + pytest autotest/interface/restful/test_restful_chat_completions_v1.py -n 20 -k '${{matrix.model_path}} and ${{matrix.backend}}' -m 'not not_${{matrix.backend}} and not internlm2_5 and not interns1' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - chat_completions_v1 + if: matrix.model == 'Intern-S1' && contains(matrix.case_info, 'chat_completions_v1') + timeout-minutes: 60 + run: | + pytest autotest/interface/restful/test_restful_chat_completions_v1.py -n 20 -k '${{matrix.model_path}} and ${{matrix.backend}}' -m 'not not_${{matrix.backend}} and not internlm2_5' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - chat_completions_v1 - internlm2_5-20b-chat + if: matrix.model == 'internlm2_5-20b-chat' && contains(matrix.case_info, 'chat_completions_v1') + timeout-minutes: 60 + run: | + pytest autotest/interface/restful/test_restful_chat_completions_v1.py -n 20 -k '${{matrix.model_path}} and ${{matrix.backend}}' -m 'not not_${{matrix.backend}} and not interns1' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - completions_v1 - internlm2_5-20b + if: matrix.model == 'internlm2_5-20b' && contains(matrix.case_info, 'completions_v1') + timeout-minutes: 60 + run: | + pytest 
autotest/interface/restful/test_restful_completions_v1.py -n 20 -k '${{matrix.model_path}} and ${{matrix.backend}}' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test lmdeploy - completions_v1 - other + if: matrix.model != 'internlm2_5-20b' && contains(matrix.case_info, 'completions_v1') + timeout-minutes: 60 + run: | + pytest autotest/interface/restful/test_restful_completions_v1.py -n 20 -k '${{matrix.model_path}} and ${{matrix.backend}} and not internlm2_5' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test generate - base + if: matrix.generate_type == 'base' && contains(matrix.case_info, 'generate') + timeout-minutes: 60 + run: | + pytest autotest/interface/restful/test_restful_generate.py -n 20 -k '${{matrix.model_path}} and ${{matrix.backend}}' -m 'not not_${{matrix.backend}} and not logprob and not experts' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test generate - logprob + if: matrix.generate_type == 'logprob' && contains(matrix.case_info, 'generate') + timeout-minutes: 60 + run: | + pytest autotest/interface/restful/test_restful_generate.py -n 20 -k '${{matrix.model_path}} and ${{matrix.backend}}' -m 'not not_${{matrix.backend}} and not experts' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Test generate - all + if: matrix.generate_type == 'all' && contains(matrix.case_info, 'generate') + timeout-minutes: 60 + run: | + pytest autotest/interface/restful/test_restful_generate.py -n 20 -k '${{matrix.model_path}} and ${{matrix.backend}}' -m 'not not_${{matrix.backend}}' --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') 
+ - name: Kill api server + if: always() + run: | + curl -f -s http://127.0.0.1:23333/terminate > /dev/null 2>&1 + - name: Clear workfile + if: always() + run: | + echo "status=done" >> ${{env.REPORT_DIR}}/status.txt + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + test_pipeline: + if: ${{!cancelled() && (github.event_name == 'schedule' || contains(fromJSON(github.event.inputs.regression_func), 'pipeline'))}} + runs-on: [self-hosted, linux-a100] + needs: test_quantization + timeout-minutes: 240 + container: + image: openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme/huggingface_hub:/nvme/huggingface_hub + - /mnt/121:/mnt/121 + - /mnt/104:/mnt/104 + - /mnt/bigdisk:/mnt/bigdisk + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . 
+ mkdir ${{env.REPORT_DIR}} -p + echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt + - name: Install lmdeploy - dependency + run: | + python3 -m pip install -r ${{env.OFFLINE_REQUIREMENTS}} + - name: Install lmdeploy + run: | + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + pip install ${{env.DEEPSEEK_VL}} --no-deps + rm -rf ${{env.DEEPSEEK_VL}}/build + - name: Check env + run: | + pip install transformers==4.57.6 + python3 -m pip list + lmdeploy check_env + rm -rf allure-results + # remove tmp log in testcase + mkdir ${{env.REPORT_DIR}}/.pytest_cache -p && rm autotest/.pytest_cache -f + ln -s ${{env.REPORT_DIR}}/.pytest_cache autotest + - name: Test lmdeploy - interface pipeline case + run: | + pytest autotest/interface/pipeline/test_pipeline_func.py -m 'not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_1 and not pr_test' -n 8 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_2 and not pr_test' -n 4 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') || true + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_4 and not pr_test' -n 2 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + pytest autotest/interface/pipeline/test_pipeline_longtext_func.py -m 'gpu_num_8 and not pr_test' -n 1 --alluredir=${{env.REPORT_DIR}} ${{env.COV_PARAM}} || true + mv .coverage ${{env.REPORT_DIR}}/.coverage.$(date +'%Y%m%d%H%M%S') + - name: Clear workfile + if: 
always() + run: | + echo "status=done" >> ${{env.REPORT_DIR}}/status.txt + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir + + get_coverage_report: + if: ${{!cancelled()}} + runs-on: [self-hosted, linux-a100] + needs: [test_tools, test_restful, test_pipeline] + timeout-minutes: 5 + container: + image: openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/qa_test_models:/nvme/qa_test_models + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: cp -r ${{env.TEST_CODE_PATH}}/. . + - name: Install lmdeploy + run: | + echo "status=done" >> ${{env.REPORT_DIR}}/status.txt + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Get coverage report + run: | + pip install coverage + coverage combine ${{env.REPORT_DIR}} + coverage xml -o ${{env.REPORT_DIR}}/coverage.xml + coverage report -m + mv .coverage ${{env.REPORT_DIR}}/.coverage + - name: Clear workfile + if: always() + run: | + chmod -R 777 $REPORT_DIR + export workdir=$(pwd) + cd .. + rm -rf $workdir + mkdir $workdir + chmod -R 777 $workdir diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index 0d87c58ccf..a1c31d2c03 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -15,9 +15,9 @@ on: default: 'main' base_models: required: true - description: 'Tested TurboMind models list. eg. 
[turbomind_qwen2_5_1_5b, turbomind_qwen2_5_7b, turbomind_qwen2_5_32b, turbomind_internlm2_5_7b, turbomind_glm_4_9b, turbomind_llama_3_1_8b, turbomind_llama_3_70b, pytorch_qwen2_5_1_5b, pytorch_qwen2_5_7b, pytorch_qwen2_5_32b, pytorch_internlm2_5_7b, pytorch_gemma_2_9b, pytorch_llama_3_70b, pytorch_llama_3_1_8b]' + description: 'Tested TurboMind models list. eg. [turbomind_qwen2_5_1_5b, turbomind_qwen2_5_7b, turbomind_qwen2_5_32b, turbomind_glm_4_9b, turbomind_llama_3_1_8b, turbomind_llama_3_70b, turbomind_qwen3_0_6b_base, turbomind_qwen3_8b_base, turbomind_qwen3_30b_A3B_base, pytorch_qwen2_5_1_5b, pytorch_qwen2_5_7b, pytorch_qwen2_5_32b, pytorch_gemma_2_9b, pytorch_llama_3_70b, pytorch_llama_3_1_8b, pytorch_qwen3_0_6b_base, pytorch_qwen3_8b_base, pytorch_qwen3_30b_A3B_base]' type: string - default: '[turbomind_qwen2_5_1_5b, turbomind_qwen2_5_7b, turbomind_qwen2_5_32b, turbomind_internlm2_5_7b, turbomind_glm_4_9b, turbomind_llama_3_1_8b, turbomind_llama_3_70b, pytorch_qwen2_5_1_5b, pytorch_qwen2_5_7b, pytorch_qwen2_5_32b, pytorch_internlm2_5_7b, pytorch_gemma_2_9b, pytorch_llama_3_70b, pytorch_llama_3_1_8b]' + default: '[turbomind_qwen2_5_1_5b, turbomind_qwen2_5_7b, turbomind_qwen2_5_32b, turbomind_glm_4_9b, turbomind_llama_3_1_8b, turbomind_llama_3_70b, turbomind_qwen3_0_6b_base, turbomind_qwen3_8b_base, turbomind_qwen3_30b_A3B_base, pytorch_qwen2_5_1_5b, pytorch_qwen2_5_7b, pytorch_qwen2_5_32b, pytorch_gemma_2_9b, pytorch_llama_3_70b, pytorch_llama_3_1_8b, pytorch_qwen3_0_6b_base, pytorch_qwen3_8b_base, pytorch_qwen3_30b_A3B_base]' baes_datasets: required: true description: 'Tested datasets list. eg. 
[*mmlu_datasets, *gsm8k_datasets]' @@ -69,7 +69,7 @@ jobs: large-packages: true swap-storage: false - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v6 with: repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} ref: ${{github.event.inputs.repo_ref || 'main'}} diff --git a/.github/workflows/evaluate_h800.yml b/.github/workflows/evaluate_h800.yml deleted file mode 100644 index ba10bc78fe..0000000000 --- a/.github/workflows/evaluate_h800.yml +++ /dev/null @@ -1,166 +0,0 @@ -name: evaluate_h800 - -on: - workflow_dispatch: - inputs: - repo_org: - required: false - description: 'Tested repository organization name. Default is InternLM/lmdeploy' - type: string - default: 'InternLM/lmdeploy' - repo_ref: - required: false - description: 'Set branch or tag or commit id. Default is "main"' - type: string - default: 'main' - base_models: - required: true - description: 'Tested TurboMind models list. eg. [turbomind_qwen3_0_6b_base, turbomind_qwen3_8b_base, turbomind_qwen3_30b_A3B_base, pytorch_qwen3_0_6b_base, pytorch_qwen3_8b_base, pytorch_qwen3_30b_A3B_base]' - type: string - default: '[turbomind_qwen3_0_6b_base, turbomind_qwen3_8b_base, turbomind_qwen3_30b_A3B_base, pytorch_qwen3_0_6b_base, pytorch_qwen3_8b_base, pytorch_qwen3_30b_A3B_base]' - baes_datasets: - required: true - description: 'Tested datasets list. eg. [*mmlu_datasets, *gsm8k_datasets]' - type: string - default: '[*race_datasets, *gsm8k_datasets, *gpqa_datasets, *winogrande_datasets]' - oc_repo_org: - required: false - description: 'Tested repository organization name. Default is open-compass/opencompass' - type: string - default: 'open-compass/opencompass' - oc_repo_ref: - required: false - description: 'Set branch or tag or commit id. 
Default is "main"' - type: string - default: 'main' - offline_mode: - required: true - description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself' - type: boolean - default: false - -env: - ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true - COMPASS_DATA_CACHE: /nvme/qa_test_models/compass_data_cache - -jobs: - linux-build: - if: ${{github.event_name == 'schedule' || (!cancelled() && !inputs.offline_mode)}} - strategy: - matrix: - pyver: [py310] - runs-on: ubuntu-latest - env: - PYTHON_VERSION: ${{ matrix.pyver }} - PLAT_NAME: manylinux2014_x86_64 - DOCKER_TAG: cuda12.8 - OUTPUT_FOLDER: cuda12.4_dist_${{ github.run_id }} - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} - ref: ${{github.event.inputs.repo_ref || 'main'}} - - name: Build - run: | - echo ${PYTHON_VERSION} - echo ${PLAT_NAME} - echo ${DOCKER_TAG} - echo ${OUTPUT_FOLDER} - echo ${GITHUB_RUN_ID} - # remove -it - sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh - bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} - - name: Upload Artifacts - uses: actions/upload-artifact@v4 - with: - if-no-files-found: error - path: builder/manywheel/${{ env.OUTPUT_FOLDER }} - retention-days: 1 - name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} - - evaluate: - needs: linux-build - if: ${{github.event_name == 'schedule' || !cancelled()}} - runs-on: [self-hosted, h800-r1] - timeout-minutes: 4320 # 72hours - strategy: - fail-fast: false - matrix: - evaluate_type: ['base'] - container: - image: m.daocloud.io/docker.io/openmmlab/lmdeploy:latest-cu12.8 - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" - volumes: - - /nvme/github-actions/pip-cache:/root/.cache/pip - - /nvme/github-actions/packages:/root/packages - - 
/nvme/github-actions/resources:/root/resources - - /nvme/qa_test_models:/nvme/qa_test_models - - /nvme1/qa_test_models:/nvme1/qa_test_models - - /nvme2/share:/nvme2/share - - /mnt/137_nvme2:/mnt/137_nvme2 - - /mnt/137_nvme3:/mnt/137_nvme3 - - /mnt/137_nvme4:/mnt/137_nvme4 - - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro - steps: - - name: Setup systems - run: | - export TIME_STAMP="$(date +'%Y%m%d-%H%M%S')" - echo "TIME_STAMP=$TIME_STAMP" >> $GITHUB_ENV - - name: Clone repository - uses: actions/checkout@v2 - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - with: - repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} - ref: ${{github.event.inputs.repo_ref || 'main'}} - - name: Copy repository - offline - if: ${{inputs.offline_mode}} - run: cp -r /root/models/offline_pkg/lmdeploy/. . - - name: Download Artifacts - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - uses: actions/download-artifact@v4 - with: - name: my-artifact-${{ github.run_id }}-py310 - - name: Install lmdeploy - dependency - run: | - python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt - - name: Install lmdeploy - if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} - run: | - python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps - python3 -m pip install -r requirements/test.txt - - name: Install lmdeploy - offline - if: ${{inputs.offline_mode}} - run: | - python3 -m pip install /root/models/offline_pkg/py310/lmdeploy-*.whl --no-deps - python3 -m pip install -r requirements/test.txt - - name: Install opencompass - run: | - git clone https://github.com/${{ github.event.inputs.oc_repo_org}}.git - cd opencompass - git checkout ${{ github.event.inputs.oc_repo_ref}} - python3 -m pip install . 
- echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV - - name: Check env - run: | - python3 -m pip list - lmdeploy check_env - - name: Setup paths for evaluation - run: | - ln -s /nvme/qa_test_models/opencompass-data ./data - python3 .github/scripts/action_tools.py create_model_links /nvme/qa_test_models . - - name: Evaluate base models - if: matrix.evaluate_type == 'base' - run: | - echo ${{github.event.inputs.base_models}} - echo ${{github.event.inputs.baes_datasets}} - export LMDEPLOY_DIR=$(pwd) - python3 .github/scripts/action_tools.py evaluate "${{github.event.inputs.base_models}}" "${{github.event.inputs.baes_datasets}}" /root/evaluation_report/${{ github.run_id }} base - - name: Clear workspace - if: always() - run: | - export workdir=$(pwd) - cd .. - rm -rf $workdir - mkdir $workdir - chmod -R 777 $workdir diff --git a/.github/workflows/evaluate_remote.yml b/.github/workflows/evaluate_remote.yml deleted file mode 100644 index 200fea5983..0000000000 --- a/.github/workflows/evaluate_remote.yml +++ /dev/null @@ -1,217 +0,0 @@ -name: evaluate_remote - -on: - workflow_dispatch: - inputs: - repo_org: - required: false - description: 'Tested repository organization name. Default is open-compass/opencompass' - type: string - default: 'open-compass/opencompass' - repo_ref: - required: false - description: 'Set branch or tag or commit id. Default is "main"' - type: string - default: 'main' - build_lmdeploy: - required: false - description: 'whether to build lmdeploy' - type: boolean - default: true - repo_org_lmdeploy: - required: false - description: 'Tested repository organization name. Default is internlm/lmdeploy' - type: string - default: 'InternLM/lmdeploy' - repo_ref_lmdeploy: - required: false - description: 'Set branch or tag or commit id. 
Default is "main"' - type: string - default: 'main' - regression_func_volc: - required: true - description: 'regression functions' - type: string - default: "['chat_models','base_models']" - backend: - required: true - description: 'Set backend filter. Default is "["turbomind", "pytorch"]"' - type: string - default: "['turbomind', 'pytorch']" - -env: - HF_DATASETS_OFFLINE: 1 - HF_EVALUATE_OFFLINE: 1 - TRANSFORMERS_OFFLINE: 1 - LMDEPLOY_USE_MODELSCOPE: false - HF_HUB_OFFLINE: 1 - OUTPUT_FOLDER: cuda12.4_dist_${{ github.run_id }} - CONDA_PATH: /fs-computility/llm/qa-llm-cicd/miniconda3 - PIP_CACHE_PATH: /fs-computility/llm/qa-llm-cicd/.cache/pip - REPORT_ROOT: /fs-computility/llm/qa-llm-cicd/eval_report/lmdeploy_regression - COMPASS_DATA_CACHE: /fs-computility/llm/shared/llmeval/datasets/compass_data_cache - HUGGINGFACE_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub - HF_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub - CONDA_ENV: lmdeploy_test - -jobs: - build-pypi: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - with: - repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }} - ref: ${{github.event.inputs.repo_ref || 'main'}} - - name: Set up Python 3.10 - uses: actions/setup-python@v4 - with: - python-version: '3.10' - - name: Build Opencompass - run: | - pip install wheel setuptools - python setup.py sdist bdist_wheel - - name: Upload Artifacts - uses: actions/upload-artifact@v4 - with: - if-no-files-found: error - path: dist/* - retention-days: 1 - name: my-artifact-${{ github.run_id }} - - build-pypi-lmdeploy: - if: ${{!cancelled() && (github.event_name == 'schedule' || inputs.build_lmdeploy)}} - strategy: - matrix: - pyver: [py310] - runs-on: ubuntu-latest - env: - PYTHON_VERSION: ${{ matrix.pyver }} - PLAT_NAME: manylinux2014_x86_64 - DOCKER_TAG: cuda12.4 - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - repository: ${{ 
github.event.inputs.repo_org_lmdeploy || 'InternLM/lmdeploy' }} - ref: ${{github.event.inputs.repo_ref_lmdeploy || 'main'}} - - name: Build - run: | - echo ${PYTHON_VERSION} - echo ${PLAT_NAME} - echo ${DOCKER_TAG} - echo ${OUTPUT_FOLDER} - echo ${GITHUB_RUN_ID} - # remove -it - sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh - bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} - - name: Upload Artifacts - uses: actions/upload-artifact@v4 - with: - if-no-files-found: error - path: builder/manywheel/${{ env.OUTPUT_FOLDER }} - retention-days: 1 - name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} - - - prepare_env: - if: ${{!cancelled()}} - needs: ['build-pypi', 'build-pypi-lmdeploy'] - runs-on: lmdeploy-volc - timeout-minutes: 120 #2hours - steps: - - name: Clone repository - uses: actions/checkout@v2 - with: - repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }} - ref: ${{github.event.inputs.repo_ref || 'main'}} - - name: Download Artifacts - uses: actions/download-artifact@v4 - with: - name: my-artifact-${{ github.run_id }} - - name: Remove Conda Env - if: always() - run: | - . /fs-computility/llm/qa-llm-cicd/miniconda3/bin/activate - conda env remove -y --name ${{env.CONDA_ENV}} - conda info --envs - - name: Prepare - create conda env and install torch - cu12 - uses: nick-fields/retry@v3 - with: - max_attempts: 1 - timeout_minutes: 120 - command: | - . 
${{env.CONDA_PATH}}/bin/activate - conda create -y --name ${{env.CONDA_ENV}} python=3.10 - conda activate ${{env.CONDA_ENV}} - unset HTTP_PROXY;unset HTTPS_PROXY;unset http_proxy;unset https_proxy; - pip install -r /fs-computility/llm/qa-llm-cicd/config/requirements.txt --cache-dir ${{env.PIP_CACHE_PATH}} - pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}} - pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}} - pip install opencompass[full] --cache-dir ${{env.PIP_CACHE_PATH}} - pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}} - FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /fs-computility/llm/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl - cp -r /root/nltk_data ${{env.CONDA_PATH}}/envs/${{env.CONDA_ENV}}/nltk_data - - name: Prepare - download lmdeploy - cu12 - if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}} - uses: actions/download-artifact@v4 - with: - name: my-artifact-${{ github.run_id }}-py310 - - name: Prepare - reinstall lmdeploy - cu12 - if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}} - run: | - . ${{env.CONDA_PATH}}/bin/activate - conda activate ${{env.CONDA_ENV}} - pip uninstall -y lmdeploy - pip install lmdeploy-*.whl --no-deps - - name: conda env - run: | - . 
${{env.CONDA_PATH}}/bin/activate - conda activate ${{env.CONDA_ENV}} - conda info --envs - pip list - - run_test_volc: - if: ${{!cancelled()}} - needs: prepare_env - strategy: - fail-fast: false - matrix: - regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models"]')}} - backend: ${{ fromJSON(github.event.inputs.backend || '["turbomind", "pytorch"]')}} - runs-on: lmdeploy-volc - timeout-minutes: 480 #6hours - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - repository: ${{ github.event.inputs.repo_org_lmdeploy || 'InternLM/lmdeploy' }} - ref: ${{github.event.inputs.repo_ref_lmdeploy || 'main'}} - - name: modify config - run: | - mkdir opencompass - cp .github/scripts/eval_regression_${{matrix.regression_func}}.py opencompass/eval_regression_${{matrix.regression_func}}.py - cp -r /fs-computility/llm/qa-llm-cicd/ocplayground/template/configs_cluster/volc.py opencompass - cat /fs-computility/llm/qa-llm-cicd/config/lmdeploy_test_config.txt >> opencompass/eval_regression_${{matrix.regression_func}}.py - - name: modify config - models filter - if: matrix.backend == 'turbomind' - run: | - echo "models = sum([v for k, v in locals().items() if k.startswith('lmdeploy_')], [])" >> opencompass/eval_regression_${{matrix.regression_func}}.py - - name: modify config - models filter - if: matrix.backend == 'pytorch' - run: | - echo "models = sum([v for k, v in locals().items() if k.startswith('pytorch_')], [])" >> opencompass/eval_regression_${{matrix.regression_func}}.py - - name: Run test - uses: nick-fields/retry@v3 - with: - max_attempts: 1 - timeout_minutes: 180 - command: | - . 
${{env.CONDA_PATH}}/bin/activate - conda activate ${{env.CONDA_ENV}} - conda info --envs - unset HTTP_PROXY;unset HTTPS_PROXY;unset http_proxy;unset https_proxy; - echo "models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])" >> opencompass/eval_regression_${{matrix.regression_func}}.py - cd opencompass - opencompass eval_regression_${{matrix.regression_func}}.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}} --reuse --dump-eval-details - cd .. - python .github/scripts/action_tools.py generate_output_for_evaluation ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}} diff --git a/.github/workflows/mllm_api_eval.yml b/.github/workflows/mllm_api_eval.yml index a9b7921c8e..75220d794b 100644 --- a/.github/workflows/mllm_api_eval.yml +++ b/.github/workflows/mllm_api_eval.yml @@ -40,7 +40,6 @@ env: ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true REPORT_DIR: /nvme/qa_test_models/mllm_evaluation_report/allure_report/${{ inputs.repo_ref }}_${{ github.run_id }} COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy - FAIL_CONFIG: '--lf' TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ inputs.repo_ref }}_${{ github.run_id }} OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt diff --git a/.github/workflows/mllm_api_eval_legacy.yml b/.github/workflows/mllm_api_eval_legacy.yml new file mode 100644 index 0000000000..86fd787b0a --- /dev/null +++ b/.github/workflows/mllm_api_eval_legacy.yml @@ -0,0 +1,216 @@ +name: mllm_api_eval_legacy + +on: + workflow_dispatch: + inputs: + repo_org: + required: false + description: 'Tested repository organization name. Default is InternLM/lmdeploy' + type: string + default: 'InternLM/lmdeploy' + repo_ref: + required: false + description: 'Set branch or tag or commit id. Default is "main"' + type: string + default: 'main' + backend: + required: true + description: 'Set backend filter. 
Default is "["turbomind", "pytorch"]"' + type: string + default: "['turbomind', 'pytorch']" + execution_mode: + required: false + description: 'Select execution mode: infer, eval, or both. Default is "both"' + type: choice + options: + - both + - infer + - eval + default: 'both' + run_id: + required: false + description: 'Set custom run ID. If not provided, github.run_id will be used' + type: string + default: '' + + +env: + HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache + HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true + REPORT_DIR: /nvme/qa_test_models/mllm_evaluation_report/allure_report/${{ inputs.repo_ref }}_${{ github.run_id }} + COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy + TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ inputs.repo_ref }}_${{ github.run_id }} + OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy + OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt + DEEPSEEK_VL: /nvme/qa_test_models/offline_pkg/DeepSeek-VL + LMUData: /nvme/qa_test_models/LMUData + LOCAL_LLM: turbomind_Qwen2.5-32B-Instruct_nccl_tp2_0 + OPENAI_API_KEY: sk-empty + HF_DATASETS_OFFLINE: 1 + HF_DATASETS_CACHE: /nvme/qa_test_models/hf_datasets + HF_HUB_OFFLINE: 1 + HF_EVALUATE_OFFLINE: 1 + RUN_ID: ${{ inputs.repo_ref }}_${{ github.run_id }} + TEST_ENV: legacy + +jobs: + linux-build: + if: ${{ !cancelled() }} + strategy: + matrix: + pyver: [py310] + runs-on: ubuntu-latest + env: + PYTHON_VERSION: ${{ matrix.pyver }} + PLAT_NAME: manylinux2014_x86_64 + DOCKER_TAG: cuda12.8 + OUTPUT_FOLDER: cuda12.8_dist_${{ github.run_id }} + steps: + - name: Free disk space + uses: jlumbroso/free-disk-space@main + with: + # This might remove tools that are actually needed, if set to "true" but frees about 6 GB + tool-cache: false + docker-images: false + # All of these default to true, but feel free to set to "false" if necessary for your workflow + android: true + dotnet: true + 
haskell: true + large-packages: true + swap-storage: false + - name: Checkout repository + uses: actions/checkout@v3 + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Build + run: | + echo ${PYTHON_VERSION} + echo ${PLAT_NAME} + echo ${DOCKER_TAG} + echo ${OUTPUT_FOLDER} + echo ${GITHUB_RUN_ID} + # remove -it + sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh + bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} + - name: Upload Artifacts + uses: actions/upload-artifact@v4 + with: + if-no-files-found: error + path: builder/manywheel/${{ env.OUTPUT_FOLDER }} + retention-days: 1 + name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }} + + download_pkgs: + needs: linux-build + if: ${{!cancelled()}} + runs-on: [self-hosted, linux-a100] + timeout-minutes: 50 + container: + image: openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/qa_test_models:/nvme/qa_test_models + - /mnt/121:/mnt/121 + - /mnt/104:/mnt/104 + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Clone repository + uses: actions/checkout@v2 + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + with: + repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} + ref: ${{github.event.inputs.repo_ref || 'main'}} + - name: Copy repository + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && chmod 777 ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}} + - name: Copy repository - offline + if: ${{inputs.offline_mode}} + run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && chmod 777 ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. 
${{env.TEST_CODE_PATH}} + - name: Download Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + uses: actions/download-artifact@v4 + with: + name: my-artifact-${{ github.run_id }}-py310 + - name: Copy Artifacts + if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + - name: Copy Artifacts - offline + if: ${{inputs.offline_mode}} + run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}} + - name: Mark as start + run: | + chmod -R 777 ${{env.TEST_CODE_PATH}} + mkdir ${{env.REPORT_DIR}} -p + echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt + + test_evaluation: + needs: download_pkgs + if: ${{ !cancelled() }} + runs-on: [self-hosted, linux-a100] + timeout-minutes: 2400 + strategy: + fail-fast: false + matrix: + backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}} + gpu_num: ['gpu_num_1', 'gpu_num_2', 'gpu_num_4', 'gpu_num_8'] + container: + image: openmmlab/lmdeploy:latest-cu12.8 + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never" + volumes: + - /nvme/github-actions/pip-cache:/root/.cache/pip + - /nvme/github-actions/packages:/root/packages + - /nvme/github-actions/resources:/root/resources + - /nvme/qa_test_models:/nvme/qa_test_models + - /nvme/huggingface_hub:/nvme/huggingface_hub + - /mnt/121:/mnt/121 + - /mnt/104:/mnt/104 + - /mnt/bigdisk:/mnt/bigdisk + - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro + steps: + - name: Copy repository and Artifacts + run: | + cp -r ${{env.TEST_CODE_PATH}}/. . 
+ mkdir ${{env.REPORT_DIR}} -p + echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt + - name: Install lmdeploy - dependency + run: | + python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt + - name: Install lmdeploy + run: | + python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps + python3 -m pip install -r requirements/test.txt + - name: Install vlmeval + run: | + python3 -m pip install pandas datasets scikit-learn pylatexenc math_verify + apt update && apt install -y libgl1 libglib2.0-0 + cp -r /nvme/qa_test_models/offline_pkg/VLMEvalKit . + cd VLMEvalKit && pip install . + - name: Check env + run: | + pip install transformers==4.57.6 + python3 -m pip list + lmdeploy check_env + mkdir ${{env.REPORT_DIR}} -p + echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt + - name: Setup paths for evaluation + if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind') + run: | + unset HTTP_PROXY;unset HTTPS_PROXY;unset http_proxy;unset https_proxy; + cd VLMEvalKit && cp -r ../autotest . + execution_mode="${{ github.event.inputs.execution_mode || 'both' }}" + ulimit -n 65535 + if [ "$execution_mode" = "both" ] || [ "$execution_mode" = "infer" ]; then + pytest autotest/evaluate/test_mllm_api_evaluate.py -m "${{matrix.gpu_num}} and ${{matrix.backend}} and infer" --alluredir=${{env.REPORT_DIR}} || overall_exit=$? + fi + if [ "$execution_mode" = "both" ] || [ "$execution_mode" = "eval" ]; then + pytest autotest/evaluate/test_mllm_api_evaluate.py -m "${{matrix.gpu_num}} and ${{matrix.backend}} and eval" -n 4 --alluredir=${{env.REPORT_DIR}} || overall_exit=$? 
+ fi + exit $overall_exit + - name: Clear workspace + if: always() + run: | + echo "status=done" >> ${{env.REPORT_DIR}}/status.txt + export workdir=$(pwd) + rm -rf $workdir/* diff --git a/.github/workflows/pr_ete_test.yml b/.github/workflows/pr_ete_test.yml index 328e791dc4..8e567800c7 100644 --- a/.github/workflows/pr_ete_test.yml +++ b/.github/workflows/pr_ete_test.yml @@ -69,7 +69,7 @@ jobs: pip install transformers==4.57.3 - name: Test restful server - turbomind Qwen3-32B run: | - CUDA_VISIBLE_DEVICES=5,6 lmdeploy serve api_server /nvme/qa_test_models/Qwen/Qwen3-32B --tp 2 --backend turbomind --logprobs-mode raw_logprobs --allow-terminate-by-client > ${{env.SERVER_LOG}}/turbomind_Qwen3-32B_start_restful.log 2>&1 & + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/Qwen/Qwen3-32B --tp 2 --backend turbomind --logprobs-mode raw_logprobs --allow-terminate-by-client > ${{env.SERVER_LOG}}/turbomind_Qwen3-32B_start_restful.log 2>&1 & echo "restful_pid=$!" for i in $(seq 1 180) do @@ -89,7 +89,7 @@ jobs: exit 1 - name: Test restful server - turbomind InternVL3-38B run: | - CUDA_VISIBLE_DEVICES=5,6 lmdeploy serve api_server /nvme/qa_test_models/OpenGVLab/InternVL3-38B --tp 2 --backend turbomind --logprobs-mode raw_logprobs --allow-terminate-by-client > ${{env.SERVER_LOG}}/turbomind_InternVL3-38B_start_restful.log 2>&1 & + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/OpenGVLab/InternVL3-38B --tp 2 --backend turbomind --logprobs-mode raw_logprobs --allow-terminate-by-client > ${{env.SERVER_LOG}}/turbomind_InternVL3-38B_start_restful.log 2>&1 & echo "restful_pid=$!" 
for i in $(seq 1 180) do @@ -109,7 +109,7 @@ jobs: exit 1 - name: Test restful server - turbomind Qwen3-30B-A3B run: | - CUDA_VISIBLE_DEVICES=5,6 lmdeploy serve api_server /nvme/qa_test_models/Qwen/Qwen3-30B-A3B --tp 2 --backend turbomind --logprobs-mode raw_logprobs --allow-terminate-by-client> ${{env.SERVER_LOG}}/turbomind_Qwen3-30B-A3B_start_restful.log 2>&1 & + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/Qwen/Qwen3-30B-A3B --tp 2 --backend turbomind --logprobs-mode raw_logprobs --allow-terminate-by-client> ${{env.SERVER_LOG}}/turbomind_Qwen3-30B-A3B_start_restful.log 2>&1 & echo "restful_pid=$!" for i in $(seq 1 180) do @@ -129,7 +129,7 @@ jobs: exit 1 - name: Test restful server - pytorch Qwen3-30B-A3B run: | - CUDA_VISIBLE_DEVICES=5,6 lmdeploy serve api_server /nvme/qa_test_models/Qwen/Qwen3-30B-A3B --tp 2 --backend pytorch --logprobs-mode raw_logprobs --enable-return-routed-experts --allow-terminate-by-client > ${{env.SERVER_LOG}}/pytorch_Qwen3-30B-A3B_start_restful.log 2>&1 & + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/Qwen/Qwen3-30B-A3B --tp 2 --backend pytorch --logprobs-mode raw_logprobs --enable-return-routed-experts --allow-terminate-by-client > ${{env.SERVER_LOG}}/pytorch_Qwen3-30B-A3B_start_restful.log 2>&1 & echo "restful_pid=$!" 
for i in $(seq 1 180) do @@ -149,7 +149,7 @@ jobs: exit 1 - name: Test restful server - pytorch Qwen3-VL-30B-A3B-Instruct run: | - CUDA_VISIBLE_DEVICES=5,6 lmdeploy serve api_server /nvme/qa_test_models/Qwen/Qwen3-VL-30B-A3B-Instruct --tp 2 --backend pytorch --logprobs-mode raw_logprobs --allow-terminate-by-client > ${{env.SERVER_LOG}}/pytorch_Qwen3-VL-30B-A3B-Instruct_start_restful.log 2>&1 & + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/Qwen/Qwen3-VL-30B-A3B-Instruct --tp 2 --backend pytorch --logprobs-mode raw_logprobs --allow-terminate-by-client > ${{env.SERVER_LOG}}/pytorch_Qwen3-VL-30B-A3B-Instruct_start_restful.log 2>&1 & echo "restful_pid=$!" for i in $(seq 1 180) do @@ -169,7 +169,7 @@ jobs: exit 1 - name: Test restful server - pytorch InternVL3_5-30B-A3B run: | - CUDA_VISIBLE_DEVICES=5,6 lmdeploy serve api_server /nvme/qa_test_models/OpenGVLab/InternVL3_5-30B-A3B --tp 2 --backend pytorch --logprobs-mode raw_logprobs --allow-terminate-by-client > ${{env.SERVER_LOG}}/pytorch_InternVL3_5-30B-A3B_start_restful.log 2>&1 & + CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/OpenGVLab/InternVL3_5-30B-A3B --tp 2 --backend pytorch --logprobs-mode raw_logprobs --allow-terminate-by-client > ${{env.SERVER_LOG}}/pytorch_InternVL3_5-30B-A3B_start_restful.log 2>&1 & echo "restful_pid=$!" 
for i in $(seq 1 180) do diff --git a/autotest/benchmark/test_apiserver_performance.py b/autotest/benchmark/test_apiserver_performance.py index 9b4947abfb..76cd8d593c 100644 --- a/autotest/benchmark/test_apiserver_performance.py +++ b/autotest/benchmark/test_apiserver_performance.py @@ -119,7 +119,7 @@ def test_pytorch_apiserver_tp16(config, run_config, worker_id): }, 'extra_params': {} }, { - 'model': 'Qwen/Qwen3-VL-32B-Instruct', + 'model': 'Qwen/Qwen3-VL-30B-A3B-Instruct', 'backend': 'pytorch', 'communicator': 'nccl', 'quant_policy': 8, diff --git a/autotest/benchmark/test_prefixcache_performance.py b/autotest/benchmark/test_prefixcache_performance.py index fd8f4156be..05d51aaf75 100644 --- a/autotest/benchmark/test_prefixcache_performance.py +++ b/autotest/benchmark/test_prefixcache_performance.py @@ -101,7 +101,7 @@ def test_pytorch_prefix_tp16(config, run_config, worker_id): }, 'extra_params': {} }, { - 'model': 'Qwen/Qwen3-VL-32B-Instruct', + 'model': 'Qwen/Qwen3-VL-30B-A3B-Instruct', 'backend': 'pytorch', 'communicator': 'nccl', 'quant_policy': 8, diff --git a/autotest/benchmark/test_throughput_performance.py b/autotest/benchmark/test_throughput_performance.py index e5f99c43da..72a98a6c80 100644 --- a/autotest/benchmark/test_throughput_performance.py +++ b/autotest/benchmark/test_throughput_performance.py @@ -102,7 +102,7 @@ def test_pytorch_throughput_tp16(config, run_config, worker_id): }, 'extra_params': {} }, { - 'model': 'Qwen/Qwen3-VL-32B-Instruct', + 'model': 'Qwen/Qwen3-VL-30B-A3B-Instruct', 'backend': 'pytorch', 'communicator': 'nccl', 'quant_policy': 8, diff --git a/autotest/config.yml b/autotest/config.yml index 80060b6d04..b848f900dd 100644 --- a/autotest/config.yml +++ b/autotest/config.yml @@ -14,36 +14,16 @@ config: tp: meta-llama/Llama-4-Scout-17B-16E-Instruct: 4 meta-llama/Meta-Llama-3-1-70B-Instruct: 4 - internlm/Intern-S1: 8 - internlm/internlm2_5-20b-chat: 2 - internlm/internlm2_5-20b: 2 - internlm/internlm2_5-7b-chat-1m: 4 
OpenGVLab/InternVL3-38B: 2 - OpenGVLab/InternVL2_5-26B: 2 - OpenGVLab/InternVL2_5-26B-MPO: 2 - OpenGVLab/InternVL2_5-38B: 4 - OpenGVLab/InternVL2-40B: 4 Qwen/Qwen3-235B-A22B: 8 - Qwen/Qwen3-32B: 2 Qwen/Qwen3-30B-A3B: 2 - Qwen/Qwen3-VL-32B-Instruct: 2 + Qwen/Qwen3-32B: 2 Qwen/Qwen3-VL-30B-A3B-Instruct: 2 Qwen/Qwen3-30B-A3B-Base: 2 - Qwen/Qwen2.5-32B-Instruct: 2 - Qwen/Qwen2.5-72B-Instruct: 4 Qwen/Qwen2.5-VL-32B-Instruct: 2 - deepseek-ai/DeepSeek-V2-Lite-Chat: 2 - deepseek-ai/DeepSeek-R1-Distill-Qwen-32B: 2 - deepseek-ai/deepseek-vl-1.3b-chat: 2 - baichuan-inc/Baichuan2-13B-Chat: 2 mistralai/Mixtral-8x7B-Instruct-v0.1: 2 - liuhaotian/llava-v1.5-13b: 2 - openbmb/MiniCPM-V-2_6: 2 - google/gemma-2-27b-it: 2 - OpenGVLab/InternVL2-Llama3-76B-AWQ: 4 - unsloth/gpt-oss-20b-BF16: 2 - unsloth/gpt-oss-120b-BF16: 4 OpenGVLab/InternVL3_5-30B-A3B: 2 + zai-org/GLM-4.7-Flash: 2 turbomind_chat_model: tp: @@ -53,61 +33,24 @@ turbomind_chat_model: - meta-llama/Meta-Llama-3-1-8B-Instruct-AWQ - meta-llama/Meta-Llama-3-1-70B-Instruct - meta-llama/Meta-Llama-3-8B-Instruct - - meta-llama/Llama-2-7b-chat-hf - - internlm/Intern-S1 - - internlm/Intern-S1-mini - internlm/internlm3-8b-instruct - internlm/internlm3-8b-instruct-awq - - internlm/internlm2_5-7b-chat - - internlm/internlm2_5-20b-chat - - OpenGVLab/InternVL3_5-30B-A3B - - OpenGVLab/InternVL3-2B - OpenGVLab/InternVL3-8B - OpenGVLab/InternVL3-38B - - OpenGVLab/InternVL2_5-26B-MPO - - OpenGVLab/InternVL2_5-1B - - OpenGVLab/InternVL2_5-8B - - OpenGVLab/InternVL2_5-26B - - OpenGVLab/InternVL2_5-38B - - OpenGVLab/InternVL2-2B - - OpenGVLab/InternVL2-40B - - OpenGVLab/Mini-InternVL-Chat-2B-V1-5 - - OpenGVLab/InternVL2-Llama3-76B-AWQ + - OpenGVLab/InternVL3_5-30B-A3B - Qwen/Qwen3-0.6B - Qwen/Qwen3-4B - Qwen/Qwen3-8B - Qwen/Qwen3-32B - Qwen/Qwen3-30B-A3B + - Qwen/Qwen3-30B-A3B-GPTQ-Int4 - Qwen/Qwen3-235B-A22B - - Qwen/Qwen2.5-0.5B-Instruct - - Qwen/Qwen2.5-7B-Instruct - - Qwen/Qwen2.5-32B-Instruct - - Qwen/Qwen2.5-72B-Instruct - - 
Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4 - Qwen/Qwen2.5-VL-7B-Instruct - Qwen/Qwen2.5-VL-32B-Instruct - - Qwen/Qwen2-VL-2B-Instruct - - Qwen/Qwen2-VL-7B-Instruct - Qwen/Qwen1.5-MoE-A2.7B-Chat - - mistralai/Mistral-7B-Instruct-v0.3 - - mistralai/Mistral-Nemo-Instruct-2407 - mistralai/Mixtral-8x7B-Instruct-v0.1 - - lmdeploy/llama2-chat-7b-w4 - - baichuan-inc/Baichuan2-7B-Chat - - 01-ai/Yi-6B-Chat - - liuhaotian/llava-v1.5-13b - - liuhaotian/llava-v1.6-vicuna-7b - - deepseek-ai/DeepSeek-R1-Distill-Llama-8B - - deepseek-ai/DeepSeek-R1-Distill-Qwen-32B - - deepseek-ai/deepseek-vl-1.3b-chat - - deepseek-ai/deepseek-coder-1.3b-instruct - - deepseek-ai/DeepSeek-V2-Lite-Chat - - codellama/CodeLlama-7b-Instruct-hf - THUDM/glm-4-9b-chat - - THUDM/codegeex4-all-9b - - openbmb/MiniCPM-Llama3-V-2_5 - - openbmb/MiniCPM-V-2_6 - # - allenai/Molmo-7B-D-0924 This modeling file requires the following packages that were not found in your environment: tensorflow. Run `pip install tensorflow` + - zai-org/GLM-4.7-Flash pytorch_chat_model: tp: @@ -117,26 +60,10 @@ pytorch_chat_model: - meta-llama/Meta-Llama-3-1-8B-Instruct - meta-llama/Meta-Llama-3-1-70B-Instruct - meta-llama/Meta-Llama-3-8B-Instruct - - meta-llama/Llama-2-7b-chat-hf - - internlm/Intern-S1 - - internlm/Intern-S1-mini - internlm/internlm3-8b-instruct - - internlm/internlm2_5-7b-chat - - internlm/internlm2_5-20b-chat - - OpenGVLab/InternVL3_5-30B-A3B - - OpenGVLab/InternVL3-2B - OpenGVLab/InternVL3-8B - OpenGVLab/InternVL3-38B - - OpenGVLab/InternVL2_5-26B-MPO - - OpenGVLab/InternVL2_5-1B - - OpenGVLab/InternVL2_5-8B - - OpenGVLab/InternVL2_5-26B - - OpenGVLab/InternVL2_5-38B - - OpenGVLab/InternVL2-2B - - OpenGVLab/InternVL2-4B - - OpenGVLab/InternVL2-40B - - OpenGVLab/InternVL2-Llama3-76B-AWQ - # - OpenGVLab/Mono-InternVL-2B 'dict' object has no attribute 'image_size' + - OpenGVLab/InternVL3_5-30B-A3B - Qwen/Qwen3-0.6B - Qwen/Qwen3-4B - Qwen/Qwen3-8B @@ -144,99 +71,32 @@ pytorch_chat_model: - Qwen/Qwen3-30B-A3B - 
Qwen/Qwen3-235B-A22B - Qwen/Qwen3-VL-8B-Instruct - - Qwen/Qwen3-VL-32B-Instruct - Qwen/Qwen3-VL-30B-A3B-Instruct - - Qwen/Qwen2.5-0.5B-Instruct - - Qwen/Qwen2.5-7B-Instruct - - Qwen/Qwen2.5-32B-Instruct - - Qwen/Qwen2.5-72B-Instruct - - Qwen/Qwen1.5-MoE-A2.7B-Chat - - Qwen/Qwen2.5-VL-7B-Instruct - - Qwen/Qwen2.5-VL-32B-Instruct - - Qwen/Qwen2-VL-2B-Instruct - - Qwen/Qwen2-VL-7B-Instruct - - unsloth/gpt-oss-20b-BF16 - - unsloth/gpt-oss-120b-BF16 - - mistralai/Mistral-7B-Instruct-v0.3 - - mistralai/Mixtral-8x7B-Instruct-v0.1 - - google/gemma-3-12b-it - - google/gemma-2-9b-it - - google/gemma-2-27b-it - - google/gemma-7b-it - - baichuan-inc/Baichuan2-7B-Chat - - baichuan-inc/Baichuan2-13B-Chat - - 01-ai/Yi-6B-Chat - - deepseek-ai/DeepSeek-R1-Distill-Llama-8B - - deepseek-ai/DeepSeek-R1-Distill-Qwen-32B - - deepseek-ai/deepseek-moe-16b-chat - - deepseek-ai/deepseek-coder-1.3b-instruct - - deepseek-ai/DeepSeek-V2-Lite-Chat - - THUDM/chatglm2-6b - THUDM/cogvlm2-llama3-chinese-chat-19B - THUDM/glm-4v-9b - THUDM/glm-4-9b-chat - - THUDM/codegeex4-all-9b - - openbmb/MiniCPM-V-2_6 - - microsoft/Phi-4-mini-instruct - - microsoft/Phi-3.5-mini-instruct + - google/gemma-2-9b-it + - google/gemma-2-27b-it + - zai-org/GLM-4.7-Flash - microsoft/Phi-3.5-vision-instruct - - microsoft/Phi-3-mini-4k-instruct - microsoft/Phi-3-vision-128k-instruct turbomind_vl_model: tp: - - internlm/Intern-S1 - - internlm/Intern-S1-mini - - OpenGVLab/InternVL2_5-26B-MPO - - OpenGVLab/Mini-InternVL-Chat-2B-V1-5 - - OpenGVLab/InternVL3_5-30B-A3B - - OpenGVLab/InternVL3-2B - OpenGVLab/InternVL3-8B - OpenGVLab/InternVL3-38B - - OpenGVLab/InternVL2_5-1B - - OpenGVLab/InternVL2_5-8B - - OpenGVLab/InternVL2_5-26B - - OpenGVLab/InternVL2_5-38B - - OpenGVLab/InternVL2-2B - - OpenGVLab/InternVL2-40B - - OpenGVLab/InternVL2-Llama3-76B-AWQ + - OpenGVLab/InternVL3_5-30B-A3B - Qwen/Qwen2.5-VL-7B-Instruct - Qwen/Qwen2.5-VL-32B-Instruct - - Qwen/Qwen2-VL-2B-Instruct - - Qwen/Qwen2-VL-7B-Instruct - - 
liuhaotian/llava-v1.5-13b - - liuhaotian/llava-v1.6-vicuna-7b - - deepseek-ai/deepseek-vl-1.3b-chat - - openbmb/MiniCPM-Llama3-V-2_5 - - openbmb/MiniCPM-V-2_6 pytorch_vl_model: tp: - - internlm/Intern-S1 - - internlm/Intern-S1-mini - - OpenGVLab/InternVL2_5-26B-MPO - - OpenGVLab/Mini-InternVL-Chat-2B-V1-5 - - OpenGVLab/InternVL3_5-30B-A3B - - OpenGVLab/InternVL3-2B - OpenGVLab/InternVL3-8B - - OpenGVLab/InternVL3-38B - - OpenGVLab/InternVL2_5-1B - - OpenGVLab/InternVL2_5-8B - - OpenGVLab/InternVL2_5-26B - - OpenGVLab/InternVL2_5-38B - - OpenGVLab/InternVL2-2B - - OpenGVLab/InternVL2-4B - - OpenGVLab/InternVL2-40B - # - OpenGVLab/Mono-InternVL-2B 'dict' object has no attribute 'image_size' + - OpenGVLab/InternVL3_5-30B-A3B - Qwen/Qwen3-VL-8B-Instruct - - Qwen/Qwen3-VL-32B-Instruct - Qwen/Qwen3-VL-30B-A3B-Instruct - - Qwen/Qwen2-VL-2B-Instruct - - Qwen/Qwen2-VL-7B-Instruct - - Qwen/Qwen2.5-VL-7B-Instruct - - Qwen/Qwen2.5-VL-32B-Instruct - THUDM/cogvlm-chat-hf - # - THUDM/cogvlm2-llama3-chinese-chat-19B # 'HFChatTemplate' object has no attribute 'eoa' + - THUDM/cogvlm2-llama3-chinese-chat-19B - THUDM/glm-4v-9b - microsoft/Phi-3-vision-128k-instruct - microsoft/Phi-3.5-vision-instruct @@ -245,191 +105,104 @@ turbomind_base_model: tp: - Qwen/Qwen3-8B-Base - Qwen/Qwen3-30B-A3B-Base - - internlm/internlm2_5-7b - - internlm/internlm2_5-1_8b - - internlm/internlm2_5-20b - - codellama/CodeLlama-7b-hf pytorch_base_model: tp: - Qwen/Qwen3-8B-Base - Qwen/Qwen3-30B-A3B-Base - - internlm/internlm2_5-7b - - internlm/internlm2_5-1_8b - - internlm/internlm2_5-20b - - bigcode/starcoder2-7b turbomind_quantization: no_awq: - - internlm/Intern-S1 - - internlm/Intern-S1-mini - meta-llama/Meta-Llama-3-1-70B-Instruct - # - internlm/internlm3-8b-instruct ImportError: cannot import name 'LossKwargs' from 'transformers.utils' (/opt/py3/lib/python3.10/site-packages/transformers/utils/__init__.py) + - internlm/internlm3-8b-instruct # ImportError: cannot import name 'LossKwargs' from 
'transformers.utils' (/opt/py3/lib/python3.10/site-packages/transformers/utils/__init__.py) - Qwen/Qwen3-30B-A3B - Qwen/Qwen3-235B-A22B - Qwen/Qwen3-30B-A3B-Base - - Qwen/Qwen3-VL-8B-Instruct - - Qwen/Qwen3-VL-32B-Instruct - - Qwen/Qwen3-VL-30B-A3B-Instruct - Qwen/Qwen1.5-MoE-A2.7B-Chat - Qwen/Qwen2.5-VL-7B-Instruct - Qwen/Qwen2.5-VL-32B-Instruct - - Qwen/Qwen2-VL-2B-Instruct - - Qwen/Qwen2-VL-7B-Instruct - OpenGVLab/InternVL3_5-30B-A3B - - mistralai/Mistral-7B-Instruct-v0.3 - - mistralai/Mistral-Nemo-Instruct-2407 - - deepseek-ai/deepseek-coder-1.3b-instruct - - deepseek-ai/DeepSeek-V2-Lite-Chat - - codellama/CodeLlama-7b-Instruct-hf - # - allenai/Molmo-7B-D-0924 This modeling file requires the following packages that were not found in your environment: tensorflow. Run `pip install tensorflow` - - THUDM/codegeex4-all-9b + - zai-org/GLM-4.7-Flash gptq: - - internlm/internlm2_5-7b-chat + - empty no_kvint4: - meta-llama/Llama-3.2-1B-Instruct - - internlm/internlm2_5-1_8b - OpenGVLab/InternVL3-2B - OpenGVLab/InternVL3-8B - - OpenGVLab/InternVL2_5-1B - - openbmb/MiniCPM-V-2_6 - - Qwen/Qwen3-8B-Base - Qwen/Qwen3-0.6B - Qwen/Qwen3-4B - Qwen/Qwen3-8B - Qwen/Qwen3-32B - Qwen/Qwen3-30B-A3B + - Qwen/Qwen3-30B-A3B-GPTQ-Int4 - Qwen/Qwen3-235B-A22B - - Qwen/Qwen3-30B-A3B-Base - - Qwen/Qwen3-VL-8B-Instruct - - Qwen/Qwen3-VL-32B-Instruct - - Qwen/Qwen3-VL-30B-A3B-Instruct - - Qwen/Qwen2.5-0.5B-Instruct - - Qwen/Qwen2.5-7B-Instruct - - Qwen/Qwen2.5-32B-Instruct - - Qwen/Qwen2.5-72B-Instruct - Qwen/Qwen2.5-VL-7B-Instruct - Qwen/Qwen2.5-VL-32B-Instruct - - Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4 - - Qwen/Qwen2-VL-2B-Instruct - - Qwen/Qwen2-VL-7B-Instruct - Qwen/Qwen1.5-MoE-A2.7B-Chat - - microsoft/Phi-3.5-mini-instruct - # - allenai/Molmo-7B-D-0924 This modeling file requires the following packages that were not found in your environment: tensorflow. 
Run `pip install tensorflow` - - deepseek-ai/DeepSeek-V2-Lite-Chat + - Qwen/Qwen3-8B-Base + - Qwen/Qwen3-30B-A3B-Base + - zai-org/GLM-4.7-Flash no_kvint8: - deepseek-ai/DeepSeek-V2-Chat - - Qwen/Qwen2.5-7B-Instruct + - zai-org/GLM-4.7-Flash pytorch_quantization: awq: + - meta-llama/Llama-3.2-3B-Instruct - meta-llama/Meta-Llama-3-8B-Instruct - meta-llama/Meta-Llama-3-1-8B-Instruct - - meta-llama/Llama-2-7b-chat-hf # - internlm/internlm3-8b-instruct ImportError: cannot import name 'LossKwargs' from 'transformers.utils' (/opt/py3/lib/python3.10/site-packages/transformers/utils/__init__.py) - - internlm/internlm2_5-7b-chat - - internlm/internlm2_5-20b-chat - - 01-ai/Yi-6B-Chat - Qwen/Qwen3-0.6B - Qwen/Qwen3-4B - Qwen/Qwen3-8B - - Qwen/Qwen3-32B - - Qwen/Qwen2.5-7B-Instruct - microsoft/Phi-3-mini-4k-instruct - # - microsoft/Phi-4-mini-instruct The size of tensor a (5120) must match the size of tensor b (3072) at non-singleton dimension 0 - THUDM/glm-4v-9b w8a8: - - meta-llama/Meta-Llama-3-8B-Instruct - meta-llama/Llama-3.2-1B-Instruct - - meta-llama/Llama-2-7b-chat-hf + - meta-llama/Meta-Llama-3-8B-Instruct + - meta-llama/Meta-Llama-3-1-8B-Instruct # - internlm/internlm3-8b-instruct ImportError: cannot import name 'LossKwargs' from 'transformers.utils' (/opt/py3/lib/python3.10/site-packages/transformers/utils/__init__.py) - - internlm/internlm2_5-7b-chat - - internlm/internlm2_5-20b-chat - - 01-ai/Yi-6B-Chat - - mistralai/Mistral-7B-Instruct-v0.3 - - Qwen/Qwen2.5-7B-Instruct - microsoft/Phi-3-mini-4k-instruct - # - microsoft/Phi-4-mini-instruct The size of tensor a (5120) must match the size of tensor b (3072) at non-singleton dimension 0 - - internlm/internlm2_5-20b - - internlm/internlm2_5-7b - - meta-llama/Meta-Llama-3-1-8B-Instruct no_kvint4: - meta-llama/Llama-3.2-1B-Instruct - - internlm/internlm2_5-1_8b - OpenGVLab/InternVL3-2B - OpenGVLab/InternVL3-8B - - OpenGVLab/InternVL2-4B - - OpenGVLab/InternVL2_5-1B - Qwen/Qwen3-8B-Base + - Qwen/Qwen3-30B-A3B-Base - 
Qwen/Qwen3-0.6B - Qwen/Qwen3-4B - Qwen/Qwen3-8B - Qwen/Qwen3-32B - Qwen/Qwen3-30B-A3B - Qwen/Qwen3-235B-A22B - - Qwen/Qwen3-30B-A3B-Base - Qwen/Qwen3-VL-8B-Instruct - - Qwen/Qwen3-VL-32B-Instruct - Qwen/Qwen3-VL-30B-A3B-Instruct - - Qwen/Qwen2.5-0.5B-Instruct - - Qwen/Qwen2.5-7B-Instruct - - Qwen/Qwen2.5-32B-Instruct - - Qwen/Qwen2.5-72B-Instruct - - Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4 - - Qwen/Qwen1.5-MoE-A2.7B-Chat - - Qwen/Qwen2.5-VL-7B-Instruct - - Qwen/Qwen2.5-VL-32B-Instruct - - Qwen/Qwen2-VL-2B-Instruct - - Qwen/Qwen2-VL-7B-Instruct - - deepseek-ai/DeepSeek-V2-Lite-Chat - - microsoft/Phi-3-mini-4k-instruct - microsoft/Phi-3-vision-128k-instruct - microsoft/Phi-3.5-vision-instruct - - microsoft/Phi-3.5-mini-instruct - - openbmb/MiniCPM-V-2_6 - - unsloth/gpt-oss-20b-BF16 - - unsloth/gpt-oss-120b-BF16 + - zai-org/GLM-4.7-Flash no_kvint8: - - deepseek-ai/DeepSeek-V2-Lite-Chat + - zai-org/GLM-4.7-Flash longtext_benchmark_model: - Qwen/Qwen3-8B - - Qwen/Qwen3-32B - - Qwen/Qwen3-30B-A3B - - Qwen/Qwen3-235B-A22B - -benchmark_model: - - internlm/Intern-S1 - - internlm/Intern-S1-mini - - meta-llama/Llama-2-7b-chat-hf - - meta-llama/Meta-Llama-3-1-8B-Instruct - - meta-llama/Meta-Llama-3-1-70B-Instruct - - internlm/internlm3-8b-instruct - - internlm/internlm2_5-7b-chat - - internlm/internlm2_5-20b-chat - - THUDM/glm-4-9b-chat - - Qwen/Qwen3-32B - Qwen/Qwen3-30B-A3B - - Qwen/Qwen3-235B-A22B - - Qwen/Qwen2.5-7B-Instruct - - Qwen/Qwen2.5-72B-Instruct - - unsloth/gpt-oss-20b-BF16 - - unsloth/gpt-oss-120b-BF16 evaluate_model: - google/gemma-2-9b-it - google/gemma-2-27b-it - meta-llama/Meta-Llama-3-1-8B-Instruct - - Qwen/Qwen2.5-7B-Instruct - - Qwen/Qwen2.5-32B-Instruct - Qwen/Qwen1.5-MoE-A2.7B-Chat - Qwen/Qwen3-30B-A3B +benchmark_model: + - meta-llama/Meta-Llama-3-1-8B-Instruct + - meta-llama/Meta-Llama-3-1-70B-Instruct + - internlm/internlm3-8b-instruct + - THUDM/glm-4-9b-chat + - Qwen/Qwen3-30B-A3B + mllm_evaluate_model: - - internlm/Intern-S1-mini - 
OpenGVLab/InternVL3-8B - Qwen/Qwen3-VL-8B-Instruct - - Qwen/Qwen3-VL-32B-Instruct - Qwen/Qwen3-VL-30B-A3B-Instruct - - internlm/Intern-S1 - OpenGVLab/InternVL3_5-30B-A3B diff --git a/autotest/config_3090.yml b/autotest/config_3090.yml index 20823f38e3..a393f95268 100644 --- a/autotest/config_3090.yml +++ b/autotest/config_3090.yml @@ -38,8 +38,6 @@ pytorch_chat_model: - Qwen/Qwen3-1.7B - Qwen/Qwen3-0.6B - Qwen/Qwen2.5-7B-Instruct - - Qwen/Qwen2.5-VL-3B-Instruct - - Qwen/Qwen2.5-VL-7B-Instruct turbomind_vl_model: tp: @@ -54,8 +52,6 @@ pytorch_vl_model: - OpenGVLab/InternVL3-2B-Instruct - OpenGVLab/InternVL3-1B-Instruct - OpenGVLab/InternVL2_5-1B - - Qwen/Qwen2.5-VL-3B-Instruct - - Qwen/Qwen2.5-VL-7B-Instruct turbomind_base_model: tp: @@ -111,7 +107,5 @@ pytorch_quantization: - Qwen/Qwen3-1.7B - Qwen/Qwen3-0.6B - Qwen/Qwen2.5-7B-Instruct - - Qwen/Qwen2.5-VL-3B-Instruct - - Qwen/Qwen2.5-VL-7B-Instruct no_kvint8: - deepseek-ai/DeepSeek-V2-Lite-Chat diff --git a/autotest/config_3090_legacy.yml b/autotest/config_3090_legacy.yml new file mode 100644 index 0000000000..20823f38e3 --- /dev/null +++ b/autotest/config_3090_legacy.yml @@ -0,0 +1,117 @@ +model_path: /nvme/qa_test_models +resource_path: /nvme/qa_test_models/resource +log_path: /nvme/qa_test_models/autotest_log +server_log_path: /nvme/qa_test_models/server_log +eval_path: /nvme/qa_test_models/evaluation_report +mllm_eval_path: /nvme/qa_test_models/mllm_evaluation_report +benchmark_path: /nvme/qa_test_models/benchmark_report +dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json +env_tag: 3090 +device: cuda + +turbomind_chat_model: + tp: + - meta-llama/Llama-3.2-3B-Instruct + - meta-llama/Llama-3.2-1B-Instruct + - internlm/internlm3-8b-instruct + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL3-2B-Instruct + - OpenGVLab/InternVL3-1B-Instruct + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-8B + - Qwen/Qwen3-4B + - Qwen/Qwen3-1.7B + - Qwen/Qwen3-0.6B + - Qwen/Qwen2.5-7B-Instruct + 
+pytorch_chat_model: + tp: + - meta-llama/Llama-3.2-3B-Instruct + - meta-llama/Llama-3.2-1B-Instruct + - internlm/internlm3-8b-instruct + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL3-2B-Instruct + - OpenGVLab/InternVL3-1B-Instruct + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-8B + - Qwen/Qwen3-4B + - Qwen/Qwen3-1.7B + - Qwen/Qwen3-0.6B + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-3B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + +turbomind_vl_model: + tp: + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL3-2B-Instruct + - OpenGVLab/InternVL3-1B-Instruct + - OpenGVLab/InternVL2_5-1B + +pytorch_vl_model: + tp: + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL3-2B-Instruct + - OpenGVLab/InternVL3-1B-Instruct + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen2.5-VL-3B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + +turbomind_base_model: + tp: + - internlm/internlm3-8b-instruct + - Qwen/Qwen3-8B + +pytorch_base_model: + tp: + - internlm/internlm3-8b-instruct + - Qwen/Qwen3-8B + +turbomind_quantization: + no_awq: + - internlm/internlm3-8b-instruct + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL3-2B-Instruct + gptq: + - empty + no_kvint4: + - meta-llama/Llama-3.2-1B-Instruct + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL3-2B-Instruct + - OpenGVLab/InternVL3-1B-Instruct + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-8B + - Qwen/Qwen3-4B + - Qwen/Qwen3-1.7B + - Qwen/Qwen3-0.6B + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-3B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + no_kvint8: + - deepseek-ai/DeepSeek-V2-Chat + +pytorch_quantization: + awq: + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-8B + - Qwen/Qwen3-4B + - Qwen/Qwen3-1.7B + - Qwen/Qwen3-0.6B + - Qwen/Qwen2.5-7B-Instruct + w8a8: + - meta-llama/Llama-3.2-3B-Instruct + no_kvint4: + - OpenGVLab/InternVL3-8B + - meta-llama/Llama-3.2-1B-Instruct + - OpenGVLab/InternVL3-2B-Instruct + - OpenGVLab/InternVL3-1B-Instruct + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-8B + - Qwen/Qwen3-4B + - Qwen/Qwen3-1.7B + - Qwen/Qwen3-0.6B + - 
Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-3B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + no_kvint8: + - deepseek-ai/DeepSeek-V2-Lite-Chat diff --git a/autotest/config_5080.yml b/autotest/config_5080.yml index a0858c021a..9c3c459cba 100644 --- a/autotest/config_5080.yml +++ b/autotest/config_5080.yml @@ -30,7 +30,6 @@ pytorch_chat_model: - Qwen/Qwen3-4B - Qwen/Qwen3-1.7B - Qwen/Qwen3-0.6B - - Qwen/Qwen2.5-VL-3B-Instruct turbomind_vl_model: tp: @@ -43,7 +42,6 @@ pytorch_vl_model: - OpenGVLab/InternVL3-2B-Instruct - OpenGVLab/InternVL3-1B-Instruct - OpenGVLab/InternVL2_5-1B - - Qwen/Qwen2.5-VL-3B-Instruct turbomind_base_model: tp: @@ -87,6 +85,5 @@ pytorch_quantization: - Qwen/Qwen3-4B - Qwen/Qwen3-1.7B - Qwen/Qwen3-0.6B - - Qwen/Qwen2.5-VL-3B-Instruct no_kvint8: - deepseek-ai/DeepSeek-V2-Lite-Chat diff --git a/autotest/config_5080_legacy.yml b/autotest/config_5080_legacy.yml new file mode 100644 index 0000000000..9d700e4240 --- /dev/null +++ b/autotest/config_5080_legacy.yml @@ -0,0 +1,91 @@ +model_path: /nvme/qa_test_models +resource_path: /nvme/qa_test_models/resource +log_path: /nvme/qa_test_models/autotest_log +server_log_path: /nvme/qa_test_models/server_log +eval_path: /nvme/qa_test_models/evaluation_report +mllm_eval_path: /nvme/qa_test_models/mllm_evaluation_report +benchmark_path: /nvme/qa_test_models/benchmark_report +dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json +env_tag: 5080 +device: cuda + +turbomind_chat_model: + tp: + - meta-llama/Llama-3.2-3B-Instruct + - meta-llama/Llama-3.2-1B-Instruct + - OpenGVLab/InternVL3-2B-Instruct + - OpenGVLab/InternVL3-1B-Instruct + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-4B + - Qwen/Qwen3-1.7B + - Qwen/Qwen3-0.6B + +pytorch_chat_model: + tp: + - meta-llama/Llama-3.2-3B-Instruct + - meta-llama/Llama-3.2-1B-Instruct + - OpenGVLab/InternVL3-2B-Instruct + - OpenGVLab/InternVL3-1B-Instruct + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-4B + - Qwen/Qwen3-1.7B + - Qwen/Qwen3-0.6B + 
+turbomind_vl_model: + tp: + - OpenGVLab/InternVL3-2B-Instruct + - OpenGVLab/InternVL3-1B-Instruct + - OpenGVLab/InternVL2_5-1B + +pytorch_vl_model: + tp: + - OpenGVLab/InternVL3-2B-Instruct + - OpenGVLab/InternVL3-1B-Instruct + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen2.5-VL-3B-Instruct + +turbomind_base_model: + tp: + - Qwen/Qwen3-4B + +pytorch_base_model: + tp: + - Qwen/Qwen3-4B + +turbomind_quantization: + no_awq: + - OpenGVLab/InternVL3-2B-Instruct + gptq: + - empty + no_kvint4: + - meta-llama/Llama-3.2-1B-Instruct + - OpenGVLab/InternVL3-2B-Instruct + - OpenGVLab/InternVL3-1B-Instruct + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-4B + - Qwen/Qwen3-1.7B + - Qwen/Qwen3-0.6B + - Qwen/Qwen2.5-VL-3B-Instruct + no_kvint8: + - deepseek-ai/DeepSeek-V2-Chat + +pytorch_quantization: + awq: + - meta-llama/Llama-3.2-3B-Instruct + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-4B + - Qwen/Qwen3-1.7B + - Qwen/Qwen3-0.6B + w8a8: + - meta-llama/Llama-3.2-3B-Instruct + no_kvint4: + - meta-llama/Llama-3.2-1B-Instruct + - OpenGVLab/InternVL3-2B-Instruct + - OpenGVLab/InternVL3-1B-Instruct + - OpenGVLab/InternVL2_5-1B + - Qwen/Qwen3-4B + - Qwen/Qwen3-1.7B + - Qwen/Qwen3-0.6B + - Qwen/Qwen2.5-VL-3B-Instruct + no_kvint8: + - deepseek-ai/DeepSeek-V2-Lite-Chat diff --git a/autotest/config_h.yml b/autotest/config_h.yml index 0bf5710beb..667033f36c 100644 --- a/autotest/config_h.yml +++ b/autotest/config_h.yml @@ -12,6 +12,7 @@ device: cuda config: tp: + Qwen/Qwen3-235B-A22B-FP8: 4 internlm/Intern-S1: 4 Qwen/Qwen3-235B-A22B-Thinking-2507-FP8: 4 Qwen/Qwen3-30B-A3B: 2 @@ -24,6 +25,8 @@ config: JetLM/SDAR-30B-A3B-Sci: 2 moonshotai/Kimi-K2-Instruct-0905: 16 Qwen/Qwen3-235B-A22B-Thinking-2507: 8 + OpenGVLab/InternVL3_5-38B: 2 + Qwen/Qwen3-VL-30B-A3B-Instruct: 2 internlm/Intern-S1-Pro-FP8: 16 dp_ep: @@ -45,8 +48,6 @@ config: turbomind_chat_model: tp: - - internlm/Intern-S1 - - internlm/Intern-S1-mini - Qwen/Qwen3-0.6B-FP8 - Qwen/Qwen3-1.7B-FP8 - Qwen/Qwen3-4B-FP8 @@ -58,6 +59,7 @@ 
turbomind_chat_model: - Qwen/Qwen3-30B-A3B-FP8 - Qwen/Qwen3-32B - Qwen/Qwen3-32B-FP8 + - OpenGVLab/InternVL3_5-38B - openai/gpt-oss-120b - openai/gpt-oss-20b @@ -66,8 +68,6 @@ turbomind_chat_model: pytorch_chat_model: tp: - - internlm/Intern-S1 - - internlm/Intern-S1-mini - Qwen/Qwen3-0.6B-FP8 - Qwen/Qwen3-1.7B-FP8 - Qwen/Qwen3-4B-FP8 @@ -79,6 +79,8 @@ pytorch_chat_model: - Qwen/Qwen3-30B-A3B-FP8 - Qwen/Qwen3-32B - Qwen/Qwen3-32B-FP8 + - Qwen/Qwen3-VL-30B-A3B-Instruct + - OpenGVLab/InternVL3_5-38B - unsloth/gpt-oss-120b-BF16 - unsloth/gpt-oss-20b-BF16 - deepseek/DeepSeek-V3.1 @@ -92,17 +94,16 @@ pytorch_chat_model: turbomind_vl_model: tp: - - internlm/Intern-S1 - - internlm/Intern-S1-mini + - OpenGVLab/InternVL3_5-38B + pytorch_vl_model: tp: - - internlm/Intern-S1 - - internlm/Intern-S1-mini + - OpenGVLab/InternVL3_5-38B + - Qwen/Qwen3-VL-30B-A3B-Instruct turbomind_base_model: tp: - - internlm/Intern-S1-mini - Qwen/Qwen3-4B-FP8 - openai/gpt-oss-20b @@ -113,8 +114,6 @@ pytorch_base_model: turbomind_quantization: no_awq: - - internlm/Intern-S1 - - internlm/Intern-S1-mini - Qwen/Qwen3-0.6B-FP8 - Qwen/Qwen3-1.7B-FP8 - Qwen/Qwen3-4B-FP8 @@ -131,8 +130,6 @@ turbomind_quantization: gptq: - empty no_kvint4: - - internlm/Intern-S1 - - internlm/Intern-S1-mini - Qwen/Qwen3-0.6B-FP8 - Qwen/Qwen3-1.7B-FP8 - Qwen/Qwen3-4B-FP8 @@ -155,8 +152,6 @@ pytorch_quantization: w8a8: - empty no_kvint4: - - internlm/Intern-S1 - - internlm/Intern-S1-mini - Qwen/Qwen3-8B-Base - Qwen/Qwen3-0.6B-FP8 - Qwen/Qwen3-1.7B-FP8 @@ -183,8 +178,6 @@ longtext_model: - Qwen/Qwen3-235B-A22B-Thinking-2507 benchmark_model: - - internlm/Intern-S1 - - internlm/Intern-S1-mini - meta-llama/Meta-Llama-3-1-8B-Instruct - meta-llama/Meta-Llama-3-1-70B-Instruct - Qwen/Qwen3-32B @@ -211,5 +204,5 @@ evaluate_model: - JetLM/SDAR-30B-A3B-Sci mllm_evaluate_model: - - internlm/Intern-S1 - - internlm/Intern-S1-mini + - OpenGVLab/InternVL3_5-38B + - Qwen/Qwen3-VL-30B-A3B-Instruct diff --git a/autotest/config_h_legacy.yml 
b/autotest/config_h_legacy.yml new file mode 100644 index 0000000000..02c9f9fcc6 --- /dev/null +++ b/autotest/config_h_legacy.yml @@ -0,0 +1,72 @@ +model_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/cicd-autotest/eval_resource/model +resource_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/cicd-autotest/eval_resource/resource +log_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/cicd-autotest/eval_resource/log +server_log_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/cicd-autotest/eval_resource/server_log +eval_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/cicd-autotest/eval_resource/evaluation_report +mllm_eval_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/cicd-autotest/eval_resource/mllm_evaluation_report +benchmark_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/cicd-autotest/eval_resource/benchmark_report +dataset_path: /mnt/shared-storage-user/auto-eval-pipeline/datasets/ShareGPT_V3_unfiltered_cleaned_split.json +prefix_dataset_path: /mnt/shared-storage-user/auto-eval-pipeline/datasets/prefix_cache_test.json +env_tag: h +device: cuda + +config: + tp: + internlm/Intern-S1: 4 + +turbomind_chat_model: + tp: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + +pytorch_chat_model: + tp: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + +turbomind_vl_model: + tp: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + +pytorch_vl_model: + tp: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + +turbomind_base_model: + tp: + +pytorch_base_model: + tp: + +turbomind_quantization: + no_awq: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + gptq: + - empty + no_kvint4: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + no_kvint8: + - empty + +pytorch_quantization: + awq: + - empty + w8a8: + - empty + no_kvint4: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + no_kvint8: + - empty + +benchmark_model: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + +mllm_evaluate_model: + - 
internlm/Intern-S1 + - internlm/Intern-S1-mini diff --git a/autotest/config_legacy.yml b/autotest/config_legacy.yml new file mode 100644 index 0000000000..74abfcd690 --- /dev/null +++ b/autotest/config_legacy.yml @@ -0,0 +1,162 @@ +model_path: /nvme/qa_test_models +resource_path: /nvme/qa_test_models/resource +log_path: /nvme/qa_test_models/autotest_log +server_log_path: /nvme/qa_test_models/server_log +eval_path: /nvme/qa_test_models/evaluation_report +mllm_eval_path: /nvme/qa_test_models/mllm_evaluation_report +benchmark_path: /nvme/qa_test_models/benchmark_report +dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json +prefix_dataset_path: /nvme/qa_test_models/datasets/prefix_cache_test.json +env_tag: a100 +device: cuda + +config: + tp: + meta-llama/Llama-4-Scout-17B-16E-Instruct: 4 + meta-llama/Meta-Llama-3-1-70B-Instruct: 4 + internlm/Intern-S1: 8 + OpenGVLab/InternVL3-38B: 2 + OpenGVLab/InternVL2_5-26B: 2 + OpenGVLab/InternVL2_5-26B-MPO: 2 + OpenGVLab/InternVL2_5-38B: 4 + OpenGVLab/InternVL2-40B: 4 + Qwen/Qwen2.5-72B-Instruct: 4 + deepseek-ai/deepseek-vl-1.3b-chat: 2 + baichuan-inc/Baichuan2-13B-Chat: 2 + mistralai/Mixtral-8x7B-Instruct-v0.1: 2 + google/gemma-2-27b-it: 2 + OpenGVLab/InternVL2-Llama3-76B-AWQ: 4 + unsloth/gpt-oss-20b-BF16: 2 + unsloth/gpt-oss-120b-BF16: 4 + OpenGVLab/InternVL3_5-30B-A3B: 2 + +turbomind_chat_model: + tp: + - meta-llama/Llama-2-7b-chat-hf + - internlm/Intern-S1 + - internlm/Intern-S1-mini + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL2_5-8B + - OpenGVLab/Mini-InternVL-Chat-2B-V1-5 + - OpenGVLab/InternVL2-Llama3-76B-AWQ + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4 + - Qwen/Qwen2.5-VL-7B-Instruct + - Qwen/Qwen2-VL-7B-Instruct + - baichuan-inc/Baichuan2-7B-Chat + - liuhaotian/llava-v1.6-vicuna-7b + - codellama/CodeLlama-7b-Instruct-hf + # - allenai/Molmo-7B-D-0924 This modeling file requires the following packages that were not found in your environment: tensorflow. 
Run `pip install tensorflow` + +pytorch_chat_model: + tp: + - meta-llama/Llama-2-7b-chat-hf + - internlm/Intern-S1 + - internlm/Intern-S1-mini + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL2_5-8B + # - OpenGVLab/Mono-InternVL-2B 'dict' object has no attribute 'image_size' + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + - Qwen/Qwen2-VL-7B-Instruct + - unsloth/gpt-oss-20b-BF16 + - mistralai/Mixtral-8x7B-Instruct-v0.1 + - google/gemma-3-12b-it + - google/gemma-2-9b-it + - google/gemma-2-27b-it + - google/gemma-7b-it + - baichuan-inc/Baichuan2-13B-Chat + - deepseek-ai/deepseek-moe-16b-chat + - THUDM/chatglm2-6b + - microsoft/Phi-4-mini-instruct + +turbomind_vl_model: + tp: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL2_5-8B + - OpenGVLab/Mini-InternVL-Chat-2B-V1-5 + - OpenGVLab/InternVL2-Llama3-76B-AWQ + - Qwen/Qwen2.5-VL-7B-Instruct + - Qwen/Qwen2-VL-7B-Instruct + - liuhaotian/llava-v1.6-vicuna-7b + +pytorch_vl_model: + tp: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + - OpenGVLab/InternVL3-8B + - OpenGVLab/InternVL2_5-8B + # - OpenGVLab/Mono-InternVL-2B 'dict' object has no attribute 'image_size' + - Qwen/Qwen2-VL-7B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + +turbomind_base_model: + tp: + - codellama/CodeLlama-7b-hf + +pytorch_base_model: + tp: + - bigcode/starcoder2-7b + +turbomind_quantization: + no_awq: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + - OpenGVLab/InternVL3-8B + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4 + - Qwen/Qwen2.5-VL-7B-Instruct + - Qwen/Qwen2-VL-7B-Instruct + - OpenGVLab/InternVL3_5-30B-A3B + - codellama/CodeLlama-7b-Instruct-hf + # - allenai/Molmo-7B-D-0924 This modeling file requires the following packages that were not found in your environment: tensorflow. 
Run `pip install tensorflow` + gptq: + - empty + no_kvint4: + - OpenGVLab/InternVL3-8B + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + - Qwen/Qwen2-VL-7B-Instruct + # - allenai/Molmo-7B-D-0924 This modeling file requires the following packages that were not found in your environment: tensorflow. Run `pip install tensorflow` + no_kvint8: + - Qwen/Qwen2.5-7B-Instruct + +pytorch_quantization: + awq: + - meta-llama/Llama-2-7b-chat-hf + # - internlm/internlm3-8b-instruct ImportError: cannot import name 'LossKwargs' from 'transformers.utils' (/opt/py3/lib/python3.10/site-packages/transformers/utils/__init__.py) + - Qwen/Qwen2.5-7B-Instruct + # - microsoft/Phi-4-mini-instruct The size of tensor a (5120) must match the size of tensor b (3072) at non-singleton dimension 0 + w8a8: + - meta-llama/Llama-2-7b-chat-hf + # - internlm/internlm3-8b-instruct ImportError: cannot import name 'LossKwargs' from 'transformers.utils' (/opt/py3/lib/python3.10/site-packages/transformers/utils/__init__.py) + - Qwen/Qwen2.5-7B-Instruct + # - microsoft/Phi-4-mini-instruct The size of tensor a (5120) must match the size of tensor b (3072) at non-singleton dimension 0 + no_kvint4: + - OpenGVLab/InternVL3-8B + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + - Qwen/Qwen2-VL-7B-Instruct + - microsoft/Phi-3-vision-128k-instruct + - microsoft/Phi-3.5-vision-instruct + - unsloth/gpt-oss-20b-BF16 + no_kvint8: + - empty + +longtext_benchmark_model: + - internlm/Intern-S1-mini + +benchmark_model: + - internlm/Intern-S1 + - internlm/Intern-S1-mini + - meta-llama/Llama-2-7b-chat-hf + - unsloth/gpt-oss-20b-BF16 + +evaluate_model: + - Qwen/Qwen2.5-7B-Instruct + +mllm_evaluate_model: + - internlm/Intern-S1-mini + - internlm/Intern-S1 diff --git a/autotest/interface/pipeline/test_pipeline_func.py b/autotest/interface/pipeline/test_pipeline_func.py index 7ea918415d..2b544634ce 100644 --- a/autotest/interface/pipeline/test_pipeline_func.py +++ 
b/autotest/interface/pipeline/test_pipeline_func.py @@ -353,7 +353,7 @@ def run_pipeline_testcase_ignore_eos(config, model, backend, file_name): pipe.close() -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_return_with_prompt(config, model, backend, worker_id): @@ -362,7 +362,7 @@ def test_return_with_prompt(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_return_with_prompt_stream(config, model, backend, worker_id): @@ -371,7 +371,7 @@ def test_return_with_prompt_stream(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_return_with_multi_prompt(config, model, backend, worker_id): @@ -380,7 +380,7 @@ def test_return_with_multi_prompt(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_return_with_multi_prompt_stream(config, model, backend, worker_id): @@ -389,7 +389,7 @@ def 
test_return_with_multi_prompt_stream(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_return_with_message(config, model, backend, worker_id): file_name = f'pipeline_log_{worker_id}.txt' @@ -397,7 +397,7 @@ def test_return_with_message(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_return_with_message_stream(config, model, backend, worker_id): file_name = f'pipeline_log_{worker_id}.txt' @@ -405,7 +405,7 @@ def test_return_with_message_stream(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_return_with_message_batch(config, model, backend, worker_id): file_name = f'pipeline_log_{worker_id}.txt' @@ -413,7 +413,7 @@ def test_return_with_message_batch(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def 
test_return_with_message_batch_stream(config, model, backend, worker_id): file_name = f'pipeline_log_{worker_id}.txt' @@ -421,7 +421,7 @@ def test_return_with_message_batch_stream(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig]) def test_return_check_logprobs(config, model, backend, worker_id): file_name = f'pipeline_log_{worker_id}.txt' @@ -429,7 +429,7 @@ def test_return_check_logprobs(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig]) def test_return_check_logprobs_stream(config, model, backend, worker_id): file_name = f'pipeline_log_{worker_id}.txt' @@ -437,7 +437,7 @@ def test_return_check_logprobs_stream(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_backend_config_session_len(config, model, backend, worker_id): file_name = f'pipeline_log_{worker_id}.txt' @@ -445,7 +445,7 @@ def test_backend_config_session_len(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) 
@pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_min_new_tokens(config, model, backend, worker_id): file_name = f'pipeline_log_min_new_tokens_{worker_id}.txt' @@ -453,7 +453,7 @@ def test_gen_config_min_new_tokens(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_stop_words(config, model, backend, worker_id): file_name = f'pipeline_log_stop_words_{worker_id}.txt' @@ -461,7 +461,7 @@ def test_gen_config_stop_words(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_bad_words(config, model, backend, worker_id): file_name = f'pipeline_log_bad_words_{worker_id}.txt' @@ -469,7 +469,7 @@ def test_gen_config_bad_words(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_special_words_false(config, model, backend, worker_id): file_name = f'pipeline_log_special_words_{worker_id}.txt' @@ -477,7 +477,7 @@ def test_gen_config_special_words_false(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) 
+@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_special_words_true(config, model, backend, worker_id): file_name = f'pipeline_log_special_words_{worker_id}.txt' @@ -485,7 +485,7 @@ def test_gen_config_special_words_true(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_minimum_repetition_penalty(config, model, backend, worker_id): file_name = f'pipeline_log_repetition_penalty_{worker_id}.txt' @@ -493,7 +493,7 @@ def test_gen_config_minimum_repetition_penalty(config, model, backend, worker_id assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_repetition_penalty_bigger_than_1(config, model, backend, worker_id): file_name = f'pipeline_log_repetition_penalty_{worker_id}.txt' @@ -501,7 +501,7 @@ def test_gen_config_repetition_penalty_bigger_than_1(config, model, backend, wor assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_minimun_topp(config, model, backend, worker_id): file_name = f'pipeline_log_{worker_id}.txt' @@ -509,7 +509,7 @@ def test_gen_config_minimun_topp(config, model, backend, 
worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_minimun_topk(config, model, backend, worker_id): file_name = f'pipeline_log_{worker_id}.txt' @@ -517,7 +517,7 @@ def test_gen_config_minimun_topk(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_diff_random_seed(config, model, backend, worker_id): file_name = f'pipeline_log_{worker_id}.txt' @@ -525,7 +525,7 @@ def test_gen_config_diff_random_seed(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_same_random_seed(config, model, backend, worker_id): file_name = f'pipeline_log_{worker_id}.txt' @@ -533,7 +533,7 @@ def test_gen_config_same_random_seed(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_do_sample_batch(config, model, backend, worker_id): file_name = 
f'pipeline_log_{worker_id}.txt' @@ -541,7 +541,7 @@ def test_gen_config_do_sample_batch(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_max_new_tokens(config, model, backend, worker_id): file_name = f'pipeline_log_max_new_tokens_{worker_id}.txt' @@ -549,7 +549,7 @@ def test_gen_config_max_new_tokens(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_gen_config_ignore_eos(config, model, backend, worker_id): file_name = f'pipeline_log_ignore_eos_{worker_id}.txt' @@ -557,7 +557,7 @@ def test_gen_config_ignore_eos(config, model, backend, worker_id): assert_pipeline_common_log(config, file_name) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig, PytorchEngineConfig]) def test_backend_config_input_validation(config, model, backend, worker_id): if 'gw' in worker_id: @@ -594,7 +594,7 @@ def test_backend_config_input_validation(config, model, backend, worker_id): unset_device_env_variable() -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig]) def 
test_backend_config_validate_turbomind(config, model, backend, worker_id): if 'gw' in worker_id: @@ -632,7 +632,7 @@ def test_backend_config_validate_turbomind(config, model, backend, worker_id): unset_device_env_variable() -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat', 'OpenGVLab/InternVL2_5-26B']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B', 'Qwen/Qwen3-30B-A3B']) @pytest.mark.parametrize('backend', [PytorchEngineConfig]) def test_backend_config_validate_pytorch(config, model, backend, worker_id): if 'gw' in worker_id: @@ -662,7 +662,7 @@ def test_backend_config_validate_pytorch(config, model, backend, worker_id): unset_device_env_variable() -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) +@pytest.mark.parametrize('model', ['OpenGVLab/InternVL3_5-30B-A3B']) @pytest.mark.parametrize('backend', [TurbomindEngineConfig]) def test_backend_config_tp(config, model, backend, worker_id): with pytest.raises(AssertionError): diff --git a/autotest/interface/pipeline/test_pipeline_longtext_func.py b/autotest/interface/pipeline/test_pipeline_longtext_func.py index a87c036814..45850ae1e9 100644 --- a/autotest/interface/pipeline/test_pipeline_longtext_func.py +++ b/autotest/interface/pipeline/test_pipeline_longtext_func.py @@ -13,14 +13,10 @@ SESSION_LEN_CONFIG = { 'Qwen/Qwen2.5-7B-Instruct': SESSION_LEN_32K, - 'Qwen/Qwen2.5-32B-Instruct': SESSION_LEN_32K, - 'Qwen/Qwen2.5-72B-Instruct': SESSION_LEN_32K, 'Qwen/Qwen3-235B-A22B': SESSION_LEN_128K, 'Qwen/Qwen3-30B-A3B': SESSION_LEN_128K, 'Qwen/Qwen3-32B': SESSION_LEN_128K, 'meta-llama/Meta-Llama-3-1-8B-Instruct': SESSION_LEN_128K, - 'internlm/Intern-S1-mini': SESSION_LEN_128K, - 'internlm/Intern-S1': SESSION_LEN_128K, 'meta-llama/Meta-Llama-3-1-70B-Instruct': SESSION_LEN_128K, } @@ -33,8 +29,7 @@ def run_case_in_spawn(target, args): @pytest.mark.gpu_num_1 -@pytest.mark.parametrize( - 'model', ['internlm/Intern-S1-mini', 'internlm/internlm2_5-7b-chat', 
'internlm/internlm2_5-7b-chat-inner-4bits']) +@pytest.mark.parametrize('model', ['Qwen/Qwen3-8B']) def test_history_issue_tp1(config, model, worker_id): if 'gw' in worker_id: set_device_env_variable(worker_id) @@ -77,10 +72,7 @@ def stream_infer_worker(config, model, tp_num): @pytest.mark.gpu_num_1 -@pytest.mark.parametrize('model', [ - 'internlm/Intern-S1-mini', 'internlm/internlm2_5-7b-chat', 'internlm/internlm2_5-7b-chat-inner-4bits', - 'Qwen/Qwen2.5-7B-Instruct', 'meta-llama/Meta-Llama-3-1-8B-Instruct' -]) +@pytest.mark.parametrize('model', ['Qwen/Qwen2.5-7B-Instruct', 'meta-llama/Meta-Llama-3-1-8B-Instruct']) @pytest.mark.parametrize('backend', ['turbomind', 'pytorch']) def test_long_test_passkey_tp1(config, model, backend, worker_id): log_name = ''.join(['pipeline_longtext_passkey_', worker_id, '.log']) @@ -93,7 +85,7 @@ def test_long_test_passkey_tp1(config, model, backend, worker_id): @pytest.mark.gpu_num_2 -@pytest.mark.parametrize('model', ['Qwen/Qwen2.5-32B-Instruct', 'Qwen/Qwen3-30B-A3B', 'Qwen/Qwen3-32B']) +@pytest.mark.parametrize('model', ['Qwen/Qwen3-30B-A3B', 'Qwen/Qwen3-32B']) @pytest.mark.parametrize('backend', ['turbomind', 'pytorch']) def test_long_test_passkey_tp2(config, model, backend, worker_id): log_name = ''.join(['pipeline_longtext_passkey_', worker_id, '.log']) @@ -106,23 +98,8 @@ def test_long_test_passkey_tp2(config, model, backend, worker_id): unset_device_env_variable() -@pytest.mark.gpu_num_4 -@pytest.mark.parametrize('model', ['Qwen/Qwen2.5-72B-Instruct']) -@pytest.mark.parametrize('backend', ['turbomind', 'pytorch']) -def test_long_test_passkey_tp4(config, model, backend, worker_id): - log_name = ''.join(['pipeline_longtext_passkey_', worker_id, '.log']) - if 'gw' in worker_id: - set_device_env_variable(worker_id, parallel_config=4) - os.environ['MASTER_PORT'] = str(int(worker_id.replace('gw', '')) + 29500) - run_case_in_spawn(passkey_retrival_worker, - (config, model, backend, log_name, 4, SESSION_LEN_CONFIG.get(model, 
SESSION_LEN_128K))) - if 'gw' in worker_id: - unset_device_env_variable() - - @pytest.mark.gpu_num_8 -@pytest.mark.parametrize('model', - ['Qwen/Qwen3-235B-A22B', 'internlm/Intern-S1', 'meta-llama/Meta-Llama-3-1-70B-Instruct']) +@pytest.mark.parametrize('model', ['Qwen/Qwen3-235B-A22B', 'meta-llama/Meta-Llama-3-1-70B-Instruct']) @pytest.mark.parametrize('backend', ['turbomind', 'pytorch']) def test_long_test_passkey_tp8(config, model, backend, worker_id): log_name = ''.join(['pipeline_longtext_passkey_', worker_id, '.log']) diff --git a/autotest/interface/restful/test_restful_generate.py b/autotest/interface/restful/test_restful_generate.py index cf4c9a463e..e08b5c3a92 100644 --- a/autotest/interface/restful/test_restful_generate.py +++ b/autotest/interface/restful/test_restful_generate.py @@ -4,7 +4,7 @@ import time from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime -from typing import Any, Dict, List +from typing import Any import pytest import requests @@ -115,8 +115,8 @@ def status_code(self): return resp def _validate_generation_response(self, - data: Dict[str, Any], - expected_fields: List[str] = None, + data: dict[str, Any], + expected_fields: list[str] | None = None, validate_tokens: bool = True, expect_logprobs: bool = False, validate_experts: bool = False) -> None: diff --git a/autotest/tools/common_case_config.py b/autotest/tools/common_case_config.py index 016bbf5e61..12334e8815 100644 --- a/autotest/tools/common_case_config.py +++ b/autotest/tools/common_case_config.py @@ -1,5 +1,5 @@ TURBOMIND_PR_TEST_LLM_GPU2 = [{ - 'model': 'internlm/internlm2_5-20b-chat', + 'model': 'Qwen/Qwen3-30B-A3B', 'backend': 'turbomind', 'communicator': 'cuda-ipc', 'quant_policy': 0, @@ -7,15 +7,6 @@ 'tp': 2 }, 'extra_params': {} -}, { - 'model': 'internlm/internlm2_5-20b-chat-inner-4bits', - 'backend': 'turbomind', - 'communicator': 'nccl', - 'quant_policy': 8, - 'parallel_config': { - 'tp': 2 - }, - 'extra_params': {} }, { 
'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'backend': 'turbomind', @@ -28,7 +19,7 @@ }] TURBOMIND_PR_TEST_LLM_GPU1 = [{ - 'model': 'OpenGVLab/InternVL3-8B', + 'model': 'Qwen/Qwen3-0.6B', 'backend': 'turbomind', 'communicator': 'cuda-ipc', 'quant_policy': 0, @@ -37,7 +28,16 @@ }, 'extra_params': {} }, { - 'model': 'OpenGVLab/InternVL3-8B', + 'model': 'Qwen/Qwen3-0.6B-inner-4bits', + 'backend': 'turbomind', + 'communicator': 'nccl', + 'quant_policy': 0, + 'parallel_config': { + 'tp': 1 + }, + 'extra_params': {} +}, { + 'model': 'Qwen/Qwen3-8B', 'backend': 'turbomind', 'communicator': 'nccl', 'quant_policy': 8, @@ -48,7 +48,7 @@ }] TURBOMIND_PR_TEST_MLLM_GPU1 = [{ - 'model': 'liuhaotian/llava-v1.6-vicuna-7b', + 'model': 'OpenGVLab/InternVL3-8B', 'backend': 'turbomind', 'communicator': 'cuda-ipc', 'quant_policy': 0, @@ -57,7 +57,7 @@ }, 'extra_params': {} }, { - 'model': 'OpenGVLab/InternVL2-4B', + 'model': 'OpenGVLab/InternVL3-8B', 'backend': 'turbomind', 'communicator': 'nccl', 'quant_policy': 8, @@ -65,19 +65,30 @@ 'tp': 1 }, 'extra_params': {} +}] + +TURBOMIND_PR_TEST_MLLM_GPU2 = [{ + 'model': 'OpenGVLab/InternVL3_5-30B-A3B', + 'backend': 'turbomind', + 'communicator': 'cuda-ipc', + 'quant_policy': 0, + 'parallel_config': { + 'tp': 2 + }, + 'extra_params': {} }, { - 'model': 'OpenGVLab/InternVL3-8B', + 'model': 'OpenGVLab/InternVL3_5-30B-A3B', 'backend': 'turbomind', 'communicator': 'nccl', 'quant_policy': 8, 'parallel_config': { - 'tp': 1 + 'tp': 2 }, 'extra_params': {} }] TURBOMIND_FALLBACK_TEST_LLM_GPU1 = [{ - 'model': 'microsoft/Phi-4-mini-instruct', + 'model': 'THUDM/cogvlm-chat-hf', 'backend': 'turbomind', 'communicator': 'cuda-ipc', 'quant_policy': 8, @@ -85,10 +96,19 @@ 'tp': 1 }, 'extra_params': {} +}, { + 'model': 'microsoft/Phi-3.5-vision-instruct', + 'backend': 'turbomind', + 'communicator': 'nccl', + 'quant_policy': 0, + 'parallel_config': { + 'tp': 1 + }, + 'extra_params': {} }] TURBOMIND_FALLBACK_TEST_LLM_GPU2 = [{ - 'model': 
'google/gemma-2-27b-it', + 'model': 'Qwen/Qwen3-VL-30B-A3B-Instruct', 'backend': 'turbomind', 'communicator': 'cuda-ipc', 'quant_policy': 0, @@ -97,7 +117,7 @@ }, 'extra_params': {} }, { - 'model': 'deepseek-ai/deepseek-moe-16b-chat', + 'model': 'Qwen/Qwen3-VL-30B-A3B-Instruct', 'backend': 'turbomind', 'communicator': 'nccl', 'quant_policy': 8, @@ -108,15 +128,6 @@ }] TURBOMIND_FALLBACK_TEST_MLLM_GPU1 = [{ - 'model': 'microsoft/Phi-4-mini-instruct', - 'backend': 'turbomind', - 'communicator': 'cuda-ipc', - 'quant_policy': 8, - 'parallel_config': { - 'tp': 1 - }, - 'extra_params': {} -}, { 'model': 'THUDM/glm-4v-9b', 'backend': 'turbomind', 'communicator': 'cuda-ipc', @@ -126,16 +137,7 @@ }, 'extra_params': {} }, { - 'model': 'THUDM/glm-4v-9b-inner-4bits', - 'backend': 'turbomind', - 'communicator': 'nccl', - 'quant_policy': 0, - 'parallel_config': { - 'tp': 1 - }, - 'extra_params': {} -}, { - 'model': 'OpenGVLab/InternVL2-4B', + 'model': 'THUDM/glm-4v-9b', 'backend': 'turbomind', 'communicator': 'nccl', 'quant_policy': 0, @@ -146,7 +148,7 @@ }] TURBOMIND_LOGPROBS_TEST_LLM_GPU2 = [{ - 'model': 'internlm/internlm2_5-20b-chat', + 'model': 'Qwen/Qwen3-30B-A3B', 'backend': 'turbomind', 'communicator': 'nccl', 'quant_policy': 0, @@ -227,7 +229,7 @@ }] PYTORCH_PR_TEST_LLM_GPU2 = [{ - 'model': 'internlm/internlm2_5-20b-chat', + 'model': 'Qwen/Qwen3-30B-A3B', 'backend': 'pytorch', 'communicator': 'nccl', 'quant_policy': 8, @@ -256,7 +258,7 @@ }, 'extra_params': {} }, { - 'model': 'OpenGVLab/InternVL3-8B', + 'model': 'Qwen/Qwen3-0.6B', 'backend': 'pytorch', 'communicator': 'nccl', 'quant_policy': 8, @@ -267,17 +269,7 @@ }] BASE_TOOLCALL_TEST_LLM = [{ - 'model': 'internlm/internlm2_5-7b-chat', - 'communicator': 'nccl', - 'quant_policy': 0, - 'parallel_config': { - 'tp': 1 - }, - 'extra_params': { - 'tool-call-parser': 'internlm' - } -}, { - 'model': 'Qwen/Qwen2.5-7B-Instruct', + 'model': 'Qwen/Qwen3-8B', 'communicator': 'nccl', 'quant_policy': 0, 'parallel_config': { @@ 
-286,16 +278,6 @@ 'extra_params': { 'tool-call-parser': 'qwen' } -}, { - 'model': 'internlm/internlm2_5-20b-chat', - 'communicator': 'nccl', - 'quant_policy': 0, - 'parallel_config': { - 'tp': 2 - }, - 'extra_params': { - 'tool-call-parser': 'internlm' - } }, { 'model': 'meta-llama/Meta-Llama-3-1-70B-Instruct', 'communicator': 'nccl', @@ -307,11 +289,11 @@ 'tool-call-parser': 'llama3' } }, { - 'model': 'Qwen/Qwen2.5-72B-Instruct', + 'model': 'Qwen/Qwen3-30B-A3B', 'communicator': 'nccl', 'quant_policy': 0, 'parallel_config': { - 'tp': 4 + 'tp': 2 }, 'extra_params': { 'tool-call-parser': 'qwen' @@ -319,24 +301,24 @@ }] BASE_REASONING_TEST_LLM = [{ - 'model': 'deepseek-ai/DeepSeek-R1-Distill-Llama-8B', + 'model': 'Qwen/Qwen3-VL-30B-A3B-Instruct', 'communicator': 'nccl', 'quant_policy': 0, 'parallel_config': { 'tp': 1 }, 'extra_params': { - 'reasoning-parser': 'deepseek-r1' + 'reasoning-parser': 'qwen-qwq' } }, { - 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', + 'model': 'Qwen/Qwen3-30B-A3B', 'communicator': 'nccl', 'quant_policy': 0, 'parallel_config': { 'tp': 2 }, 'extra_params': { - 'reasoning-parser': 'deepseek-r1' + 'reasoning-parser': 'qwen-qwq' } }] diff --git a/autotest/tools/pipeline/llm_case.py b/autotest/tools/pipeline/llm_case.py index 3efe84d9e2..13ce7de514 100644 --- a/autotest/tools/pipeline/llm_case.py +++ b/autotest/tools/pipeline/llm_case.py @@ -63,7 +63,7 @@ def run_pipeline_chat_test(model_path, run_config, cases_path, is_pr_test: bool for case in cases_info.keys(): if is_pr_test and case != 'memory_test': continue - if case != 'code_testcases' and 'code' in model_path.lower(): + if case != 'code_testcase' and 'code' in model_path.lower(): continue case_info = cases_info.get(case) diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py index 4676a34341..bb146b8178 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py +++ 
b/autotest/tools/pipeline/test_pipeline_chat_turbomind_mllm.py @@ -1,6 +1,7 @@ import pytest -from tools.common_case_config import TURBOMIND_FALLBACK_TEST_MLLM_GPU1, TURBOMIND_PR_TEST_MLLM_GPU1 -from utils.config_utils import get_func_config_list +from tools.common_case_config import (TURBOMIND_FALLBACK_TEST_MLLM_GPU1, TURBOMIND_PR_TEST_MLLM_GPU1, + TURBOMIND_PR_TEST_MLLM_GPU2) +from utils.config_utils import get_func_config_list, get_workerid from utils.pipeline_chat import run_pipeline_mllm_test BACKEND = 'turbomind' @@ -50,6 +51,17 @@ def test_restful_chat_fallback_backend_tp1(config, run_config, worker_id): @pytest.mark.gpu_num_1 @pytest.mark.other +@pytest.mark.pr_test @pytest.mark.parametrize('run_config', TURBOMIND_PR_TEST_MLLM_GPU1) def test_pipeline_pr_test(config, run_config, worker_id): + worker_id = 'gw' + str(6 + get_workerid(worker_id)) + run_pipeline_mllm_test(config, run_config, worker_id, is_smoke=True) + + +@pytest.mark.gpu_num_2 +@pytest.mark.other +@pytest.mark.pr_test +@pytest.mark.parametrize('run_config', TURBOMIND_PR_TEST_MLLM_GPU2) +def test_pipeline_pr_tp2_test(config, run_config, worker_id): + worker_id = 'gw' + str(3 + get_workerid(worker_id)) run_pipeline_mllm_test(config, run_config, worker_id, is_smoke=True) diff --git a/autotest/tools/quantization/test_quantization_awq.py b/autotest/tools/quantization/test_quantization_awq.py index a92e0d5420..7a6bcb1e52 100644 --- a/autotest/tools/quantization/test_quantization_awq.py +++ b/autotest/tools/quantization/test_quantization_awq.py @@ -30,7 +30,7 @@ def test_quantization_gptq(config, model, worker_id): @pytest.mark.gpu_num_2 @pytest.mark.flaky(reruns=0) @pytest.mark.timeout(900) -@pytest.mark.parametrize('model', ['internlm/internlm2_5-20b-chat']) +@pytest.mark.parametrize('model', ['Qwen/Qwen3-0.6B']) def test_quantization_awq_pr(config, model): quantization_type = 'awq' quantization_all(config, model + '-inner-4bits', model, quantization_type, cuda_prefix='CUDA_VISIBLE_DEVICES=6') diff 
--git a/autotest/utils/common_utils.py b/autotest/utils/common_utils.py index 3a7fcd473f..f54c3aa489 100644 --- a/autotest/utils/common_utils.py +++ b/autotest/utils/common_utils.py @@ -1,14 +1,13 @@ import os import subprocess import sys -from typing import Tuple def execute_command_with_logging(cmd, log_file_path: str, timeout: int = 3600, env=None, - should_print=True) -> Tuple[bool, str]: + should_print=True) -> tuple[bool, str]: if env is None: env = os.environ.copy() diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index 362a97ac67..3d71fe1e0d 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -1,7 +1,7 @@ import copy import os from collections import OrderedDict -from typing import Any, Dict, List, Optional +from typing import Any import yaml @@ -12,7 +12,7 @@ SUFFIX_INNER_W8A8 = '-inner-w8a8' -def resolve_extra_params(extra_params: Dict[str, Any], model_base_path: str) -> None: +def resolve_extra_params(extra_params: dict[str, Any], model_base_path: str) -> None: """Resolve relative model paths in extra_params to absolute paths. Centralised helper so that every call-site does not need its own @@ -37,10 +37,10 @@ def resolve_extra_params(extra_params: Dict[str, Any], model_base_path: str) -> def get_func_config_list(backend: str, - parallel_config: Dict[str, int], + parallel_config: dict[str, int], model_type: str = 'chat_model', func_type: str = 'func', - extra: Optional[Dict[str, Any]] = None) -> List[Dict]: + extra: dict[str, Any] | None = None) -> list[dict[str, Any]]: """Generate all valid running config combinations (communicator + quant policy + model). 
@@ -51,7 +51,7 @@ def get_func_config_list(backend: str, func_type: Test func type filter, default: func extra: extra config to update in each run config dict Returns: - List[Dict]: All valid run config dicts + list[dict]: All valid run config dicts """ config = get_config() device = config.get('device', 'cuda') @@ -105,6 +105,10 @@ def get_func_config_list(backend: str, if config.get('env_tag', '') in ['3090', '5080']: run_config['extra_params']['cache-max-entry-count'] = 0.5 + if config.get('env_tag', '') in ['a100'] and ('Qwen3-235B-A22B' in run_config['model'] + or run_config['model'] == 'internlm/Intern-S1'): + run_config['extra_params']['cache-max-entry-count'] = 0.6 + if 'sdar' in run_config['model'].lower(): run_config['extra_params']['dllm-block-length'] = 4 run_config['extra_params']['dllm-denoising-steps'] = 4 @@ -127,7 +131,7 @@ def get_func_config_list(backend: str, return run_configs -def get_cli_common_param(run_config: Dict[str, Any]) -> str: +def get_cli_common_param(run_config: dict[str, Any]) -> str: """Generate cli common params string by run config dict.""" backend = run_config.get('backend') model = run_config.get('model') @@ -162,7 +166,7 @@ def get_cli_common_param(run_config: Dict[str, Any]) -> str: return ' '.join(cli_params).strip() -def get_cli_str(config: Dict[str, Any]) -> str: +def get_cli_str(config: dict[str, Any]) -> str: cli_str = [] # Extra params for key, value in config.items(): @@ -181,7 +185,7 @@ def get_cli_str(config: Dict[str, Any]) -> str: return ' '.join(cli_str) -def get_parallel_config(config: Dict, model_name: str) -> List[Dict[str, int]]: +def get_parallel_config(config: dict[str, Any], model_name: str) -> list[dict[str, int]]: """Get matched parallel config dict by model name, default tp:1 if no match.""" result = [] @@ -201,23 +205,23 @@ def get_parallel_config(config: Dict, model_name: str) -> List[Dict[str, int]]: return result if result else [{'tp': 1}] -def _extract_models_from_config(config_value: Any) -> 
List[str]: +def _extract_models_from_config(config_value: Any) -> list[str]: """Extract flat model name list from config value (dict/list supported)""" models = [] - if isinstance(config_value, Dict): + if isinstance(config_value, dict): for model_list in config_value.values(): - if isinstance(model_list, List): + if isinstance(model_list, list): models.extend([m for m in model_list if isinstance(m, str)]) - elif isinstance(config_value, List): + elif isinstance(config_value, list): models.extend([m for m in config_value if isinstance(m, str)]) return models -def get_model_list(config: Dict, +def get_model_list(config: dict[str, Any], backend: str, - parallel_config: Dict[str, int] = None, + parallel_config: dict[str, int] | None = None, model_type: str = 'chat_model', - func_type: str = 'func') -> List[str]: + func_type: str = 'func') -> list[str]: """Get filtered model list with quantization extended models by backend/parallel config/model type/func type. @@ -228,7 +232,7 @@ def get_model_list(config: Dict, model_type: Model type, default: chat_model func_type: Test func type filter, default: func Returns: - List[str]: Base models + quantization extended models + list[str]: Base models + quantization extended models """ model_config_key = f'{backend}_{model_type}' all_models = [] @@ -252,7 +256,7 @@ def get_model_list(config: Dict, return extended_models -def _filter_by_test_func_type(config: Dict, model_list: List[str], func_type: str) -> List[str]: +def _filter_by_test_func_type(config: dict[str, Any], model_list: list[str], func_type: str) -> list[str]: """Filter model list by test function type, return intersection of two model sets.""" if func_type == 'func': @@ -266,7 +270,8 @@ def _filter_by_test_func_type(config: Dict, model_list: List[str], func_type: st return list(set(filtered_models) & set(model_list)) -def _extend_turbomind_quant_models(quant_config: dict, base_models: list, target_list: list) -> None: +def 
_extend_turbomind_quant_models(quant_config: dict[str, Any], base_models: list[str], + target_list: list[str]) -> None: """Append turbomind quantization models to target list (AWQ 4bits + GPTQ)""" no_awq_models = quant_config.get('no_awq', []) @@ -280,7 +285,7 @@ def _extend_turbomind_quant_models(quant_config: dict, base_models: list, target target_list.append(model_name + SUFFIX_INNER_GPTQ) -def _extend_pytorch_quant_models(quant_config: dict, base_models: list, target_list: list) -> None: +def _extend_pytorch_quant_models(quant_config: dict[str, Any], base_models: list[str], target_list: list[str]) -> None: """Append pytorch quantization models to target list (AWQ 4bits + W8A8)""" # Append AWQ quantization models for model_name in quant_config.get('awq', []): @@ -292,7 +297,7 @@ def _extend_pytorch_quant_models(quant_config: dict, base_models: list, target_l target_list.append(model_name + SUFFIX_INNER_W8A8) -def _is_kvint_model(config: Dict, backend: str, model: str, quant_policy: int) -> bool: +def _is_kvint_model(config: dict[str, Any], backend: str, model: str, quant_policy: int) -> bool: """Check if model supports the kv quantization policy, quant_policy=0 always return True.""" if quant_policy == 0: @@ -308,7 +313,7 @@ def _base_model_name(model: str) -> str: return model.replace('-inner-4bits', '').replace('-inner-w8a8', '').replace('-inner-gptq', '') -def get_quantization_model_list(type: str) -> List[str]: +def get_quantization_model_list(type: str) -> list[str]: """Get quantization model list by specified quant type(awq/gptq/w8a8)""" config = get_config() quant_model_list = [] @@ -340,7 +345,7 @@ def get_quantization_model_list(type: str) -> List[str]: return quant_model_list -def get_config() -> Dict[str, Any]: +def get_config() -> dict[str, Any]: """Load & get yaml config file, auto adapt device env & update log path.""" # Get device env & match config file path env_tag = os.environ.get('TEST_ENV') @@ -370,7 +375,7 @@ def get_config() -> Dict[str, 
Any]: return config_copy -def get_cuda_prefix_by_workerid(worker_id: Optional[str], parallel_config: Dict[str, int] = None) -> Optional[str]: +def get_cuda_prefix_by_workerid(worker_id: str | None, parallel_config: dict[str, int] | None = None) -> str | None: """Get cuda/ascend visible devices env prefix by worker id & parallel config.""" para_conf = parallel_config or {} @@ -387,7 +392,7 @@ def get_cuda_prefix_by_workerid(worker_id: Optional[str], parallel_config: Dict[ return f'ASCEND_RT_VISIBLE_DEVICES={cuda_id}' if device_type == 'ascend' else f'CUDA_VISIBLE_DEVICES={cuda_id}' -def get_cuda_id_by_workerid(worker_id: Optional[str], tp_num: int = 1) -> Optional[str]: +def get_cuda_id_by_workerid(worker_id: str | None, tp_num: int = 1) -> str | None: """Get cuda id str by worker id and tp num, return None if invalid worker id.""" if worker_id is None or 'gw' not in worker_id: @@ -398,7 +403,7 @@ def get_cuda_id_by_workerid(worker_id: Optional[str], tp_num: int = 1) -> Option return ','.join([str(cuda_num + i) for i in range(tp_num)]) -def get_workerid(worker_id: Optional[str]) -> int: +def get_workerid(worker_id: str | None) -> int: """Parse numeric worker id from worker id str, return 0 if invalid worker id.""" if worker_id is None or 'gw' not in worker_id: @@ -413,7 +418,9 @@ def is_quantization_model(model: str) -> bool: return any(key in lower_name for key in ('awq', '4bits', 'w4', 'int4')) -def _get_communicator_list(config: Dict, backend: str, parallel_config: Dict[str, int] = None) -> List[str]: +def _get_communicator_list(config: dict[str, Any], + backend: str, + parallel_config: dict[str, int] | None = None) -> list[str]: """Get available communicator list by device and parallel config.""" device = config.get('device', None) @@ -429,7 +436,7 @@ def _get_communicator_list(config: Dict, backend: str, parallel_config: Dict[str return ['nccl', 'cuda-ipc'] -def set_device_env_variable(worker_id, parallel_config: Dict[str, int] = None): +def 
set_device_env_variable(worker_id: str | None, parallel_config: dict[str, int] | None = None) -> None: """Set device environment variable based on the device type.""" device = os.environ.get('DEVICE', 'cuda') @@ -460,13 +467,13 @@ def unset_device_env_variable(): del os.environ['CUDA_VISIBLE_DEVICES'] -def is_model_in_list(config: Dict, parallel_config: Dict[str, int], model: str) -> bool: +def is_model_in_list(config: dict[str, Any], parallel_config: dict[str, int], model: str) -> bool: """Check if model matches the target parallel config.""" model_config = get_parallel_config(config, model) return parallel_config in model_config -def get_case_str_by_config(run_config: Dict[str, Any], is_simple: bool = True) -> str: +def get_case_str_by_config(run_config: dict[str, Any], is_simple: bool = True) -> str: """Generate case name string by run config dict.""" model_name = run_config['model'] backend_type = run_config['backend'] @@ -491,7 +498,7 @@ def get_case_str_by_config(run_config: Dict[str, Any], is_simple: bool = True) - return f'{backend_type}_{pure_model_name}_{communicator}_{parallel_str}_{quant_policy}{extra_params_case}' -def parse_config_by_case(case_str: str) -> Dict[str, Any]: +def parse_config_by_case(case_str: str) -> dict[str, Any]: """Parse run config dict from case name string (fix split & type convert bug)""" case_parts = case_str.split('_') diff --git a/autotest/utils/constant.py b/autotest/utils/constant.py index 0a3e78a018..153d3220e7 100644 --- a/autotest/utils/constant.py +++ b/autotest/utils/constant.py @@ -153,9 +153,9 @@ RESTFUL_MODEL_LIST = [ 'Qwen/Qwen3-0.6B', 'Qwen/Qwen3-VL-2B-Instruct', 'Qwen/Qwen3-30B-A3B', 'internlm/Intern-S1', - 'internlm/internlm2_5-20b-chat', 'internlm/internlm2_5-20b', 'Qwen/Qwen3-32B', 'OpenGVLab/InternVL3_5-30B-A3B', - 'OpenGVLab/InternVL3-38B', 'Qwen/Qwen3-VL-8B-Instruct', 'internlm/internlm3-8b-instruct', - 'meta-llama/Llama-3.2-3B-Instruct', 'Qwen/Qwen3-VL-30B-A3B-Instruct' + 'internlm/internlm2_5-20b', 
'Qwen/Qwen3-32B', 'OpenGVLab/InternVL3_5-30B-A3B', 'OpenGVLab/InternVL3-38B', + 'Qwen/Qwen3-VL-8B-Instruct', 'internlm/internlm3-8b-instruct', 'meta-llama/Llama-3.2-3B-Instruct', + 'Qwen/Qwen3-VL-30B-A3B-Instruct' ] RESTFUL_BASE_MODEL_LIST = [ diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index f3e6694840..61576c841a 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -50,7 +50,7 @@ def run_pipeline_llm_test(config, run_config, common_case_config, worker_id: str for case in common_case_config.keys(): if is_smoke and case != 'memory_test': continue - if case != 'code_testcases' and 'code' in model_path.lower(): + if case != 'code_testcase' and 'code' in model_path.lower(): continue with allure.step(case): diff --git a/autotest/utils/proxy_distributed_utils.py b/autotest/utils/proxy_distributed_utils.py index dc4efdebad..0472af3953 100644 --- a/autotest/utils/proxy_distributed_utils.py +++ b/autotest/utils/proxy_distributed_utils.py @@ -3,7 +3,7 @@ import socket import subprocess import time -from typing import Any, Dict, Tuple +from typing import Any import requests from utils.config_utils import get_case_str_by_config, get_cli_common_param, resolve_extra_params @@ -28,7 +28,7 @@ def is_port_open(host: str, port: int, timeout: float = 1.0) -> bool: def check_nodes_status(host: str, proxy_port: int, model_name: str, expected_instances: int, check_count: int, current_time: float, last_progress_print: float, - progress_print_interval: int) -> Tuple[bool, int]: + progress_print_interval: int) -> tuple[bool, int]: try: nodes_url = f'http://{host}:{proxy_port}/nodes/status' resp = requests.get(nodes_url, timeout=10) @@ -215,7 +215,7 @@ def cleanup(self): class ApiServerPerTest: - def __init__(self, proxy_manager: ProxyDistributedManager, config: Dict[str, Any], run_config: Dict[str, Any]): + def __init__(self, proxy_manager: ProxyDistributedManager, config: dict[str, Any], run_config: dict[str, Any]): 
self.proxy_manager = proxy_manager self.config = config self.run_config = run_config diff --git a/autotest/utils/ray_distributed_utils.py b/autotest/utils/ray_distributed_utils.py index 2b87a4bb41..919745632a 100644 --- a/autotest/utils/ray_distributed_utils.py +++ b/autotest/utils/ray_distributed_utils.py @@ -4,7 +4,7 @@ import subprocess import time from time import time as time_time -from typing import Any, Dict +from typing import Any import requests from utils.config_utils import get_case_str_by_config, get_cli_common_param, resolve_extra_params @@ -150,7 +150,7 @@ def start_ray_cluster(self): print(f'💥 Ray startup failed: {e.stderr}') raise - def start_lmdeploy_api_server(self, config: dict, run_config: dict): + def start_lmdeploy_api_server(self, config: dict[str, Any], run_config: dict[str, Any]) -> None: """ Master node: Start LMDeploy API Server and wait for it to be ready. Worker nodes: Do not start the service, only verify that the master node's API Server is ready. @@ -252,7 +252,7 @@ def cleanup(self, force: bool = True): print(f'⚠️ Ray stop exception: {e}') self._cleaned = True # Only mark as "fully cleaned" when force=True - def get_cluster_info(self) -> Dict[str, Any]: + def get_cluster_info(self) -> dict[str, Any]: return { 'node_rank': self.node_rank, 'node_count': self.node_count, diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index 13192d37c5..7cb93166a8 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -123,6 +123,8 @@ def run_all_step(log_path, case_name, cases_info, port: int = DEFAULT_PORT): if model is None: assert False, 'server not start correctly' for case in cases_info.keys(): + if case != 'code_testcase' and 'code' in model.lower(): + continue case_info = cases_info.get(case) with allure.step(case + ' restful_test - openai chat'): @@ -153,17 +155,34 @@ def open_chat_test(log_path, case_name, case_info, url): messages.append({'role': 'user', 'content': 
prompt}) file.writelines('prompt:' + prompt + '\n') - response = client.chat.completions.create(model=model_name, - messages=messages, - temperature=0.01, - top_p=0.8, - max_completion_tokens=1024) - - output_content = response.choices[0].message.content - file.writelines('output:' + output_content + '\n') + outputs = client.chat.completions.create(model=model_name, + messages=messages, + temperature=0.01, + top_p=0.8, + max_completion_tokens=1024, + stream=True) + + content_chunks = [] + reasoning_content_chunks = [] + for output in outputs: + # Safely handle streaming chunks: choices may be empty and content may be None + if not getattr(output, 'choices', None): + continue + choice = output.choices[0] + delta = getattr(choice, 'delta', None) + reasoning_content = getattr(delta, 'reasoning_content', None) if delta is not None else None + content = getattr(delta, 'content', None) if delta is not None else None + if reasoning_content: + reasoning_content_chunks.append(reasoning_content) + if content: + content_chunks.append(content) + reasoning_content = ''.join(reasoning_content_chunks) + output_content = ''.join(content_chunks) + + file.writelines(f'reasoning_content :{reasoning_content}, content: {output_content}\n') messages.append({'role': 'assistant', 'content': output_content}) - case_result, reason = assert_result(output_content, prompt_detail.values(), model_name) + case_result, reason = assert_result(reasoning_content + output_content, prompt_detail.values(), model_name) file.writelines('result:' + str(case_result) + ',reason:' + reason + '\n') if not case_result: msg += reason diff --git a/autotest/utils/toolkit.py b/autotest/utils/toolkit.py index 7341c9d044..28078c1336 100644 --- a/autotest/utils/toolkit.py +++ b/autotest/utils/toolkit.py @@ -1,10 +1,9 @@ from functools import lru_cache -from typing import List from transformers import AutoTokenizer -def parse_sse_stream(content: str) -> list: +def parse_sse_stream(content: str) -> list[str]: """Parse 
SSE (Server-Sent Events) stream content into a list of events. Each event is either a JSON string or "[DONE]". @@ -31,7 +30,7 @@ def _load_tokenizer_cached(model_path: str): raise RuntimeError(f"Failed to load tokenizer from '{model_path}': {e}") -def encode_text(model_path: str, text: str) -> List[int]: +def encode_text(model_path: str, text: str) -> list[int]: tokenizer = _load_tokenizer_cached(model_path) encoded = tokenizer.encode(text)