From db0a28033d8bfabeb991808c7b28bcbf1f7e697a Mon Sep 17 00:00:00 2001
From: songhappy <guoqiongsong@gmail.com>
Date: Tue, 27 Aug 2024 19:12:02 -0700
Subject: [PATCH 1/6] test main 3072-384: comment out 32-32/1024-128/2048-256 igpu perf test steps, add debug prints to run.py

---
 .github/workflows/llm_performance_tests.yml | 610 ++++++++++----------
 python/llm/dev/benchmark/all-in-one/run.py  |   6 +
 2 files changed, 311 insertions(+), 305 deletions(-)

diff --git a/.github/workflows/llm_performance_tests.yml b/.github/workflows/llm_performance_tests.yml
index 4b092ed04cf..37e6243f341 100644
--- a/.github/workflows/llm_performance_tests.yml
+++ b/.github/workflows/llm_performance_tests.yml
@@ -640,359 +640,359 @@ jobs:
                       run_transformer_int4_fp16_gpu_win(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, cpu_embedding, batch_size, streaming)
           ' python/llm/dev/benchmark/all-in-one/run.py
 
-      # 32-32 int4+fp16
-      - name: Prepare igpu perf test (32-32 int4+fp16)
-        shell: bash
-        run: |
-          # hide time info
-          # sed -i 's/str(end - st)/"xxxxxx"/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i 's/{api}-results-{today}.csv/32-32-{api}-results-{today}_test1.csv/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_int4_fp16.yaml
-
-      - name: Test on igpu (32-32 int4+fp16)
-        shell: cmd
-        run: |
-          call conda activate igpu-perf
-          set SYCL_CACHE_PERSISTENT=1
-          set BIGDL_LLM_XMX_DISABLED=1
-          REM for llava
-          set TRANSFORMERS_OFFLINE=1
-
-          cd python\llm\dev\benchmark\all-in-one
-          move ..\..\..\test\benchmark\igpu-perf\32-32_int4_fp16.yaml config.yaml
-          set PYTHONIOENCODING=utf-8
-          python run.py >> %CSV_SAVE_PATH%\32-32_int4_fp16\log\%LOG_FILE% 2>&1
-          if %ERRORLEVEL% neq 0 (exit /b 1)
-          python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test1
-          if %ERRORLEVEL% neq 0 (exit /b 1)
+      # # 32-32 int4+fp16
+      # - name: Prepare igpu perf test (32-32 int4+fp16)
+      #   shell: bash
+      #   run: |
+      #     # hide time info
+      #     # sed -i 's/str(end - st)/"xxxxxx"/g' python/llm/dev/benchmark/all-in-one/run.py
+      #     sed -i 's/{api}-results-{today}.csv/32-32-{api}-results-{today}_test1.csv/g' python/llm/dev/benchmark/all-in-one/run.py
+      #     sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_int4_fp16.yaml
 
-          call conda deactivate
+      # - name: Test on igpu (32-32 int4+fp16)
+      #   shell: cmd
+      #   run: |
+      #     call conda activate igpu-perf
+      #     set SYCL_CACHE_PERSISTENT=1
+      #     set BIGDL_LLM_XMX_DISABLED=1
+      #     REM for llava
+      #     set TRANSFORMERS_OFFLINE=1
+
+      #     cd python\llm\dev\benchmark\all-in-one
+      #     move ..\..\..\test\benchmark\igpu-perf\32-32_int4_fp16.yaml config.yaml
+      #     set PYTHONIOENCODING=utf-8
+      #     python run.py >> %CSV_SAVE_PATH%\32-32_int4_fp16\log\%LOG_FILE% 2>&1
+      #     if %ERRORLEVEL% neq 0 (exit /b 1)
+      #     python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test1
+      #     if %ERRORLEVEL% neq 0 (exit /b 1)
+
+      #     call conda deactivate
+
+      # - name: Prepare igpu perf test for transformers 4.36 (32-32 int4+fp16)
+      #   shell: bash
+      #   run: |
+      #     sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
+      #     sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_436.yaml
 
-      - name: Prepare igpu perf test for transformers 4.36 (32-32 int4+fp16)
-        shell: bash
-        run: |
-          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_436.yaml
+      # - name: Test on igpu for transformers 4.36 (32-32 int4+fp16)
+      #   shell: cmd
+      #   run: |
+      #     call conda activate igpu-perf
+      #     pip install transformers==4.36.2
 
-      - name: Test on igpu for transformers 4.36 (32-32 int4+fp16)
-        shell: cmd
-        run: |
-          call conda activate igpu-perf
-          pip install transformers==4.36.2
+      #     set SYCL_CACHE_PERSISTENT=1
+      #     set BIGDL_LLM_XMX_DISABLED=1
 
-          set SYCL_CACHE_PERSISTENT=1
-          set BIGDL_LLM_XMX_DISABLED=1
+      #     cd python\llm\dev\benchmark\all-in-one
+      #     move ..\..\..\test\benchmark\igpu-perf\32-32_int4_fp16_436.yaml config.yaml
+      #     set PYTHONIOENCODING=utf-8
+      #     python run.py >> %CSV_SAVE_PATH%\32-32_int4_fp16\log\%LOG_FILE% 2>&1
+      #     if %ERRORLEVEL% neq 0 (exit /b 1)
+      #     python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test2
+      #     if %ERRORLEVEL% neq 0 (exit /b 1)
 
-          cd python\llm\dev\benchmark\all-in-one
-          move ..\..\..\test\benchmark\igpu-perf\32-32_int4_fp16_436.yaml config.yaml
-          set PYTHONIOENCODING=utf-8
-          python run.py >> %CSV_SAVE_PATH%\32-32_int4_fp16\log\%LOG_FILE% 2>&1
-          if %ERRORLEVEL% neq 0 (exit /b 1)
-          python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test2
-          if %ERRORLEVEL% neq 0 (exit /b 1)
-
-          call conda deactivate
+      #     call conda deactivate
           
-      - name: Prepare igpu perf test for transformers 4.38 (32-32 int4+fp16)
-        shell: bash
-        run: |
-          sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_438.yaml
-
-      - name: Test on igpu for transformers 4.38 (32-32 int4+fp16)
-        shell: cmd
-        run: |
-          call conda activate igpu-perf
-          pip install transformers==4.38.2
-
-          set SYCL_CACHE_PERSISTENT=1
-          set BIGDL_LLM_XMX_DISABLED=1
-
-          cd python\llm\dev\benchmark\all-in-one
-          move ..\..\..\test\benchmark\igpu-perf\32-32_int4_fp16_438.yaml config.yaml
-          set PYTHONIOENCODING=utf-8
-          python run.py >> %CSV_SAVE_PATH%\32-32_int4_fp16\log\%LOG_FILE% 2>&1
-          if %ERRORLEVEL% neq 0 if %ERRORLEVEL% neq -1073740791 (exit /b 1)
-          python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test3
-          if %ERRORLEVEL% neq 0 (exit /b 1)
-
-          call conda deactivate
-
-      - name: Prepare igpu perf test for transformers 4.43 (32-32 int4+fp16)
-        shell: bash
-        run: |
-          sed -i 's/{today}_test3/{today}_test4/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_443.yaml
-
-      - name: Test on igpu for transformers 4.43 (32-32 int4+fp16)
-        shell: cmd
-        run: |
-          call conda activate igpu-perf
-          pip install transformers==4.43.1
-          pip install trl
-
-          set SYCL_CACHE_PERSISTENT=1
-          set BIGDL_LLM_XMX_DISABLED=1
-
-          cd python\llm\dev\benchmark\all-in-one
-          move ..\..\..\test\benchmark\igpu-perf\32-32_int4_fp16_443.yaml config.yaml
-          set PYTHONIOENCODING=utf-8
-          python run.py >> %CSV_SAVE_PATH%\32-32_int4_fp16\log\%LOG_FILE% 2>&1
-          if %ERRORLEVEL% neq 0 if %ERRORLEVEL% neq -1073740791 (exit /b 1)
-          python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test4
-          if %ERRORLEVEL% neq 0 (exit /b 1)
-
-          pip uninstall trl -y
-          call conda deactivate
-
-      - name: Concat csv and generate html (32-32 int4+fp16)
-        shell: cmd
-        run: |
-          call conda activate html-gen
-
-          cd python\llm\dev\benchmark\all-in-one
-          python ..\..\..\test\benchmark\concat_csv.py
-          if %ERRORLEVEL% neq 0 (exit /b 1)
-          del /q *test*.csv
-          move *.csv %CSV_SAVE_PATH%\32-32_int4_fp16\
-          cd ..\..\..\test\benchmark
-          python csv_to_html.py -f %CSV_SAVE_PATH%\32-32_int4_fp16\
-          if %ERRORLEVEL% neq 0 (exit /b 1)
-          move %CSV_SAVE_PATH%\32-32_int4_fp16\*.html %CSV_SAVE_PATH%
-
-          call conda deactivate
-
-      # TODO: create a action function here for different input
-      # 1024-128 int4+fp16
-      - name: Prepare igpu perf test (1024-128 int4+fp16)
-        shell: bash
-        run: |
-          sed -i 's/32-32/1024-128/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i 's/{today}_test4/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
-
-      - name: Test on igpu (1024-128 int4+fp16)
-        shell: cmd
-        run: |
-          call conda activate igpu-perf
-          pip install transformers==4.37.0
+      # - name: Prepare igpu perf test for transformers 4.38 (32-32 int4+fp16)
+      #   shell: bash
+      #   run: |
+      #     sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
+      #     sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_438.yaml
 
-          set SYCL_CACHE_PERSISTENT=1
-          set BIGDL_LLM_XMX_DISABLED=1
-          REM for llava
-          set TRANSFORMERS_OFFLINE=1
+      # - name: Test on igpu for transformers 4.38 (32-32 int4+fp16)
+      #   shell: cmd
+      #   run: |
+      #     call conda activate igpu-perf
+      #     pip install transformers==4.38.2
 
-          cd python\llm\dev\benchmark\all-in-one
-          move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16.yaml config.yaml
-          set PYTHONIOENCODING=utf-8
-          python run.py >> %CSV_SAVE_PATH%\1024-128_int4_fp16\log\%LOG_FILE% 2>&1
-          if %ERRORLEVEL% neq 0 (exit /b 1)
-          python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test1
-          if %ERRORLEVEL% neq 0 (exit /b 1)
+      #     set SYCL_CACHE_PERSISTENT=1
+      #     set BIGDL_LLM_XMX_DISABLED=1
 
-          call conda deactivate
+      #     cd python\llm\dev\benchmark\all-in-one
+      #     move ..\..\..\test\benchmark\igpu-perf\32-32_int4_fp16_438.yaml config.yaml
+      #     set PYTHONIOENCODING=utf-8
+      #     python run.py >> %CSV_SAVE_PATH%\32-32_int4_fp16\log\%LOG_FILE% 2>&1
+      #     if %ERRORLEVEL% neq 0 if %ERRORLEVEL% neq -1073740791 (exit /b 1)
+      #     python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test3
+      #     if %ERRORLEVEL% neq 0 (exit /b 1)
 
-      - name: Prepare igpu perf test for transformers 4.36 (1024-128 int4+fp16)
-        shell: bash
-        run: |
-          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_436.yaml
+      #     call conda deactivate
 
-      - name: Test on igpu for transformers 4.36 (1024-128 int4+fp16)
-        shell: cmd
-        run: |
-          call conda activate igpu-perf
-          pip install transformers==4.36.2
+      # - name: Prepare igpu perf test for transformers 4.43 (32-32 int4+fp16)
+      #   shell: bash
+      #   run: |
+      #     sed -i 's/{today}_test3/{today}_test4/g' python/llm/dev/benchmark/all-in-one/run.py
+      #     sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_443.yaml
 
-          set SYCL_CACHE_PERSISTENT=1
-          set BIGDL_LLM_XMX_DISABLED=1
+      # - name: Test on igpu for transformers 4.43 (32-32 int4+fp16)
+      #   shell: cmd
+      #   run: |
+      #     call conda activate igpu-perf
+      #     pip install transformers==4.43.1
+      #     pip install trl
 
-          cd python\llm\dev\benchmark\all-in-one
-          move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16_436.yaml config.yaml
-          set PYTHONIOENCODING=utf-8
-          python run.py >> %CSV_SAVE_PATH%\1024-128_int4_fp16\log\%LOG_FILE% 2>&1
-          if %ERRORLEVEL% neq 0 (exit /b 1)
-          python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test2
-          if %ERRORLEVEL% neq 0 (exit /b 1)
+      #     set SYCL_CACHE_PERSISTENT=1
+      #     set BIGDL_LLM_XMX_DISABLED=1
 
-          call conda deactivate
+      #     cd python\llm\dev\benchmark\all-in-one
+      #     move ..\..\..\test\benchmark\igpu-perf\32-32_int4_fp16_443.yaml config.yaml
+      #     set PYTHONIOENCODING=utf-8
+      #     python run.py >> %CSV_SAVE_PATH%\32-32_int4_fp16\log\%LOG_FILE% 2>&1
+      #     if %ERRORLEVEL% neq 0 if %ERRORLEVEL% neq -1073740791 (exit /b 1)
+      #     python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test4
+      #     if %ERRORLEVEL% neq 0 (exit /b 1)
 
-      - name: Prepare igpu perf test for transformers 4.38 (1024-128 int4+fp16)
-        shell: bash
-        run: |
-          sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_438.yaml
+      #     pip uninstall trl -y
+      #     call conda deactivate
 
-      - name: Test on igpu for transformers 4.38 (1024-128 int4+fp16)
-        shell: cmd
-        run: |
-          call conda activate igpu-perf
-          pip install transformers==4.38.2
+      # - name: Concat csv and generate html (32-32 int4+fp16)
+      #   shell: cmd
+      #   run: |
+      #     call conda activate html-gen
+
+      #     cd python\llm\dev\benchmark\all-in-one
+      #     python ..\..\..\test\benchmark\concat_csv.py
+      #     if %ERRORLEVEL% neq 0 (exit /b 1)
+      #     del /q *test*.csv
+      #     move *.csv %CSV_SAVE_PATH%\32-32_int4_fp16\
+      #     cd ..\..\..\test\benchmark
+      #     python csv_to_html.py -f %CSV_SAVE_PATH%\32-32_int4_fp16\
+      #     if %ERRORLEVEL% neq 0 (exit /b 1)
+      #     move %CSV_SAVE_PATH%\32-32_int4_fp16\*.html %CSV_SAVE_PATH%
+
+      #     call conda deactivate
+
+      # # TODO: create a action function here for different input
+      # # 1024-128 int4+fp16
+      # - name: Prepare igpu perf test (1024-128 int4+fp16)
+      #   shell: bash
+      #   run: |
+      #     sed -i 's/32-32/1024-128/g' python/llm/dev/benchmark/all-in-one/run.py
+      #     sed -i 's/{today}_test4/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
+      #     sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
 
-          set SYCL_CACHE_PERSISTENT=1
-          set BIGDL_LLM_XMX_DISABLED=1
+      # - name: Test on igpu (1024-128 int4+fp16)
+      #   shell: cmd
+      #   run: |
+      #     call conda activate igpu-perf
+      #     pip install transformers==4.37.0
+
+      #     set SYCL_CACHE_PERSISTENT=1
+      #     set BIGDL_LLM_XMX_DISABLED=1
+      #     REM for llava
+      #     set TRANSFORMERS_OFFLINE=1
+
+      #     cd python\llm\dev\benchmark\all-in-one
+      #     move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16.yaml config.yaml
+      #     set PYTHONIOENCODING=utf-8
+      #     python run.py >> %CSV_SAVE_PATH%\1024-128_int4_fp16\log\%LOG_FILE% 2>&1
+      #     if %ERRORLEVEL% neq 0 (exit /b 1)
+      #     python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test1
+      #     if %ERRORLEVEL% neq 0 (exit /b 1)
+
+      #     call conda deactivate
+
+      # - name: Prepare igpu perf test for transformers 4.36 (1024-128 int4+fp16)
+      #   shell: bash
+      #   run: |
+      #     sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
+      #     sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_436.yaml
 
-          cd python\llm\dev\benchmark\all-in-one
-          move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16_438.yaml config.yaml
-          set PYTHONIOENCODING=utf-8
-          python run.py >> %CSV_SAVE_PATH%\1024-128_int4_fp16\log\%LOG_FILE% 2>&1
-          if %ERRORLEVEL% neq 0 (exit /b 1)
-          python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test3
-          if %ERRORLEVEL% neq 0 (exit /b 1)
+      # - name: Test on igpu for transformers 4.36 (1024-128 int4+fp16)
+      #   shell: cmd
+      #   run: |
+      #     call conda activate igpu-perf
+      #     pip install transformers==4.36.2
 
-          call conda deactivate
+      #     set SYCL_CACHE_PERSISTENT=1
+      #     set BIGDL_LLM_XMX_DISABLED=1
 
-      - name: Prepare igpu perf test for transformers 4.43 (1024-128 int4+fp16)
-        shell: bash
-        run: |
-          sed -i 's/{today}_test3/{today}_test4/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_443.yaml
+      #     cd python\llm\dev\benchmark\all-in-one
+      #     move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16_436.yaml config.yaml
+      #     set PYTHONIOENCODING=utf-8
+      #     python run.py >> %CSV_SAVE_PATH%\1024-128_int4_fp16\log\%LOG_FILE% 2>&1
+      #     if %ERRORLEVEL% neq 0 (exit /b 1)
+      #     python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test2
+      #     if %ERRORLEVEL% neq 0 (exit /b 1)
 
-      - name: Test on igpu for transformers 4.43 (1024-128 int4+fp16)
-        shell: cmd
-        run: |
-          call conda activate igpu-perf
-          pip install transformers==4.43.1
-          pip install trl
+      #     call conda deactivate
 
-          set SYCL_CACHE_PERSISTENT=1
-          set BIGDL_LLM_XMX_DISABLED=1
+      # - name: Prepare igpu perf test for transformers 4.38 (1024-128 int4+fp16)
+      #   shell: bash
+      #   run: |
+      #     sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
+      #     sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_438.yaml
 
-          cd python\llm\dev\benchmark\all-in-one
-          move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16_443.yaml config.yaml
-          set PYTHONIOENCODING=utf-8
-          python run.py >> %CSV_SAVE_PATH%\1024-128_int4_fp16\log\%LOG_FILE% 2>&1
-          if %ERRORLEVEL% neq 0 (exit /b 1)
-          python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test4
-          if %ERRORLEVEL% neq 0 (exit /b 1)
+      # - name: Test on igpu for transformers 4.38 (1024-128 int4+fp16)
+      #   shell: cmd
+      #   run: |
+      #     call conda activate igpu-perf
+      #     pip install transformers==4.38.2
 
-          pip uninstall trl -y
-          call conda deactivate
+      #     set SYCL_CACHE_PERSISTENT=1
+      #     set BIGDL_LLM_XMX_DISABLED=1
 
-      - name: Concat csv and generate html (1024-128 int4+fp16)
-        shell: cmd
-        run: |
-          call conda activate html-gen
+      #     cd python\llm\dev\benchmark\all-in-one
+      #     move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16_438.yaml config.yaml
+      #     set PYTHONIOENCODING=utf-8
+      #     python run.py >> %CSV_SAVE_PATH%\1024-128_int4_fp16\log\%LOG_FILE% 2>&1
+      #     if %ERRORLEVEL% neq 0 (exit /b 1)
+      #     python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test3
+      #     if %ERRORLEVEL% neq 0 (exit /b 1)
 
-          cd python\llm\dev\benchmark\all-in-one
-          python ..\..\..\test\benchmark\concat_csv.py
-          if %ERRORLEVEL% neq 0 (exit /b 1)
-          del /q *test*.csv
-          move *.csv %CSV_SAVE_PATH%\1024-128_int4_fp16\
-          cd ..\..\..\test\benchmark
-          python csv_to_html.py -f %CSV_SAVE_PATH%\1024-128_int4_fp16\
-          if %ERRORLEVEL% neq 0 (exit /b 1)
-          move %CSV_SAVE_PATH%\1024-128_int4_fp16\*.html %CSV_SAVE_PATH%
+      #     call conda deactivate
 
-          call conda deactivate
+      # - name: Prepare igpu perf test for transformers 4.43 (1024-128 int4+fp16)
+      #   shell: bash
+      #   run: |
+      #     sed -i 's/{today}_test3/{today}_test4/g' python/llm/dev/benchmark/all-in-one/run.py
+      #     sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_443.yaml
 
-      # 2048-256 int4+fp16
-      - name: Prepare igpu perf test (2048-256 int4+fp16)
-        shell: bash
-        run: |
-          sed -i 's/1024-128/2048-256/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i 's/{today}_test4/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16.yaml
+      # - name: Test on igpu for transformers 4.43 (1024-128 int4+fp16)
+      #   shell: cmd
+      #   run: |
+      #     call conda activate igpu-perf
+      #     pip install transformers==4.43.1
+      #     pip install trl
 
-      - name: Test on igpu (2048-256 int4+fp16)
-        shell: cmd
-        run: |
-          call conda activate igpu-perf
-          pip install transformers==4.37.0
+      #     set SYCL_CACHE_PERSISTENT=1
+      #     set BIGDL_LLM_XMX_DISABLED=1
 
-          set SYCL_CACHE_PERSISTENT=1
-          set BIGDL_LLM_XMX_DISABLED=1
-          REM for llava
-          set TRANSFORMERS_OFFLINE=1
+      #     cd python\llm\dev\benchmark\all-in-one
+      #     move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16_443.yaml config.yaml
+      #     set PYTHONIOENCODING=utf-8
+      #     python run.py >> %CSV_SAVE_PATH%\1024-128_int4_fp16\log\%LOG_FILE% 2>&1
+      #     if %ERRORLEVEL% neq 0 (exit /b 1)
+      #     python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test4
+      #     if %ERRORLEVEL% neq 0 (exit /b 1)
 
-          cd python\llm\dev\benchmark\all-in-one
-          move ..\..\..\test\benchmark\igpu-perf\2048-256_int4_fp16.yaml config.yaml
-          set PYTHONIOENCODING=utf-8
-          python run.py >> %CSV_SAVE_PATH%\2048-256_int4_fp16\log\%LOG_FILE% 2>&1
-          if %ERRORLEVEL% neq 0 (exit /b 1)
-          python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test1
-          if %ERRORLEVEL% neq 0 (exit /b 1)
+      #     pip uninstall trl -y
+      #     call conda deactivate
 
-          call conda deactivate
+      # - name: Concat csv and generate html (1024-128 int4+fp16)
+      #   shell: cmd
+      #   run: |
+      #     call conda activate html-gen
+
+      #     cd python\llm\dev\benchmark\all-in-one
+      #     python ..\..\..\test\benchmark\concat_csv.py
+      #     if %ERRORLEVEL% neq 0 (exit /b 1)
+      #     del /q *test*.csv
+      #     move *.csv %CSV_SAVE_PATH%\1024-128_int4_fp16\
+      #     cd ..\..\..\test\benchmark
+      #     python csv_to_html.py -f %CSV_SAVE_PATH%\1024-128_int4_fp16\
+      #     if %ERRORLEVEL% neq 0 (exit /b 1)
+      #     move %CSV_SAVE_PATH%\1024-128_int4_fp16\*.html %CSV_SAVE_PATH%
+
+      #     call conda deactivate
+
+      # # 2048-256 int4+fp16
+      # - name: Prepare igpu perf test (2048-256 int4+fp16)
+      #   shell: bash
+      #   run: |
+      #     sed -i 's/1024-128/2048-256/g' python/llm/dev/benchmark/all-in-one/run.py
+      #     sed -i 's/{today}_test4/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
+      #     sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16.yaml
 
-      - name: Prepare igpu perf test for transformers 4.36 (2048-256 int4+fp16)
-        shell: bash
-        run: |
-          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_436.yaml
+      # - name: Test on igpu (2048-256 int4+fp16)
+      #   shell: cmd
+      #   run: |
+      #     call conda activate igpu-perf
+      #     pip install transformers==4.37.0
+
+      #     set SYCL_CACHE_PERSISTENT=1
+      #     set BIGDL_LLM_XMX_DISABLED=1
+      #     REM for llava
+      #     set TRANSFORMERS_OFFLINE=1
+
+      #     cd python\llm\dev\benchmark\all-in-one
+      #     move ..\..\..\test\benchmark\igpu-perf\2048-256_int4_fp16.yaml config.yaml
+      #     set PYTHONIOENCODING=utf-8
+      #     python run.py >> %CSV_SAVE_PATH%\2048-256_int4_fp16\log\%LOG_FILE% 2>&1
+      #     if %ERRORLEVEL% neq 0 (exit /b 1)
+      #     python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test1
+      #     if %ERRORLEVEL% neq 0 (exit /b 1)
+
+      #     call conda deactivate
+
+      # - name: Prepare igpu perf test for transformers 4.36 (2048-256 int4+fp16)
+      #   shell: bash
+      #   run: |
+      #     sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
+      #     sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_436.yaml
 
-      - name: Test on igpu for transformers 4.36 (2048-256 int4+fp16)
-        shell: cmd
-        run: |
-          call conda activate igpu-perf
-          pip install transformers==4.36.2
+      # - name: Test on igpu for transformers 4.36 (2048-256 int4+fp16)
+      #   shell: cmd
+      #   run: |
+      #     call conda activate igpu-perf
+      #     pip install transformers==4.36.2
 
-          set SYCL_CACHE_PERSISTENT=1
-          set BIGDL_LLM_XMX_DISABLED=1
+      #     set SYCL_CACHE_PERSISTENT=1
+      #     set BIGDL_LLM_XMX_DISABLED=1
 
-          cd python\llm\dev\benchmark\all-in-one
-          move ..\..\..\test\benchmark\igpu-perf\2048-256_int4_fp16_436.yaml config.yaml
-          set PYTHONIOENCODING=utf-8
-          python run.py >> %CSV_SAVE_PATH%\2048-256_int4_fp16\log\%LOG_FILE% 2>&1
-          if %ERRORLEVEL% neq 0 (exit /b 1)
-          python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test2
-          if %ERRORLEVEL% neq 0 (exit /b 1)
+      #     cd python\llm\dev\benchmark\all-in-one
+      #     move ..\..\..\test\benchmark\igpu-perf\2048-256_int4_fp16_436.yaml config.yaml
+      #     set PYTHONIOENCODING=utf-8
+      #     python run.py >> %CSV_SAVE_PATH%\2048-256_int4_fp16\log\%LOG_FILE% 2>&1
+      #     if %ERRORLEVEL% neq 0 (exit /b 1)
+      #     python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test2
+      #     if %ERRORLEVEL% neq 0 (exit /b 1)
 
-          call conda deactivate
+      #     call conda deactivate
 
-      - name: Prepare igpu perf test for transformers 4.38 (2048-256 int4+fp16)
-        shell: bash
-        run: |
-          sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_438.yaml
+      # - name: Prepare igpu perf test for transformers 4.38 (2048-256 int4+fp16)
+      #   shell: bash
+      #   run: |
+      #     sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
+      #     sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_438.yaml
 
-      - name: Test on igpu for transformers 4.38 (2048-256 int4+fp16)
-        shell: cmd
-        run: |
-          call conda activate igpu-perf
-          pip install transformers==4.38.2
+      # - name: Test on igpu for transformers 4.38 (2048-256 int4+fp16)
+      #   shell: cmd
+      #   run: |
+      #     call conda activate igpu-perf
+      #     pip install transformers==4.38.2
 
-          set SYCL_CACHE_PERSISTENT=1
-          set BIGDL_LLM_XMX_DISABLED=1
+      #     set SYCL_CACHE_PERSISTENT=1
+      #     set BIGDL_LLM_XMX_DISABLED=1
 
-          cd python\llm\dev\benchmark\all-in-one
-          move ..\..\..\test\benchmark\igpu-perf\2048-256_int4_fp16_438.yaml config.yaml
-          set PYTHONIOENCODING=utf-8
-          python run.py >> %CSV_SAVE_PATH%\2048-256_int4_fp16\log\%LOG_FILE% 2>&1
-          if %ERRORLEVEL% neq 0 (exit /b 1)
-          python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test3
-          if %ERRORLEVEL% neq 0 (exit /b 1)
+      #     cd python\llm\dev\benchmark\all-in-one
+      #     move ..\..\..\test\benchmark\igpu-perf\2048-256_int4_fp16_438.yaml config.yaml
+      #     set PYTHONIOENCODING=utf-8
+      #     python run.py >> %CSV_SAVE_PATH%\2048-256_int4_fp16\log\%LOG_FILE% 2>&1
+      #     if %ERRORLEVEL% neq 0 (exit /b 1)
+      #     python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test3
+      #     if %ERRORLEVEL% neq 0 (exit /b 1)
 
-          call conda deactivate
+      #     call conda deactivate
 
-      - name: Prepare igpu perf test for transformers 4.43 (2048-256 int4+fp16)
-        shell: bash
-        run: |
-          sed -i 's/{today}_test3/{today}_test4/g' python/llm/dev/benchmark/all-in-one/run.py
-          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_443.yaml
+      # - name: Prepare igpu perf test for transformers 4.43 (2048-256 int4+fp16)
+      #   shell: bash
+      #   run: |
+      #     sed -i 's/{today}_test3/{today}_test4/g' python/llm/dev/benchmark/all-in-one/run.py
+      #     sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_443.yaml
 
-      - name: Test on igpu for transformers 4.43 (2048-256 int4+fp16)
-        shell: cmd
-        run: |
-          call conda activate igpu-perf
-          pip install transformers==4.43.1
-          pip install trl
+      # - name: Test on igpu for transformers 4.43 (2048-256 int4+fp16)
+      #   shell: cmd
+      #   run: |
+      #     call conda activate igpu-perf
+      #     pip install transformers==4.43.1
+      #     pip install trl
 
-          set SYCL_CACHE_PERSISTENT=1
-          set BIGDL_LLM_XMX_DISABLED=1
+      #     set SYCL_CACHE_PERSISTENT=1
+      #     set BIGDL_LLM_XMX_DISABLED=1
 
-          cd python\llm\dev\benchmark\all-in-one
-          move ..\..\..\test\benchmark\igpu-perf\2048-256_int4_fp16_443.yaml config.yaml
-          set PYTHONIOENCODING=utf-8
-          python run.py >> %CSV_SAVE_PATH%\2048-256_int4_fp16\log\%LOG_FILE% 2>&1
-          if %ERRORLEVEL% neq 0 (exit /b 1)
-          python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test4
-          if %ERRORLEVEL% neq 0 (exit /b 1)
+      #     cd python\llm\dev\benchmark\all-in-one
+      #     move ..\..\..\test\benchmark\igpu-perf\2048-256_int4_fp16_443.yaml config.yaml
+      #     set PYTHONIOENCODING=utf-8
+      #     python run.py >> %CSV_SAVE_PATH%\2048-256_int4_fp16\log\%LOG_FILE% 2>&1
+      #     if %ERRORLEVEL% neq 0 (exit /b 1)
+      #     python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test4
+      #     if %ERRORLEVEL% neq 0 (exit /b 1)
 
-          pip uninstall trl -y
-          call conda deactivate
+      #     pip uninstall trl -y
+      #     call conda deactivate
 
-      - name: Concat csv and generate html (2048-256 int4+fp16)
+      # - name: Concat csv and generate html (2048-256 int4+fp16)
         shell: cmd
         run: |
           call conda activate html-gen
diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index 4b5a64d664a..7df128e9b59 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -2049,6 +2049,9 @@ def run_pipeline_parallel_gpu(repo_id,
         for batch_size in batch_list:
             for model in conf.repo_id:
                 in_out_pairs = conf['in_out_pairs'].copy()
+                print("-------------------- Start running batch_size: {} --------------------".format(batch_size))
+                print("-------------------- Start running model: {} --------------------".format(model))
+                print("--------------------in_out_pairs: {}--------------------".format(in_out_pairs))
                 if excludes:
                     for in_out in conf['in_out_pairs']:
                         model_id_input = model + ':' + in_out.split('-')[0]
@@ -2059,9 +2062,11 @@ def run_pipeline_parallel_gpu(repo_id,
                     lookahead = True
                 run_model(model, api, in_out_pairs, conf['local_model_hub'], conf['warm_up'], conf['num_trials'], conf['num_beams'],
                       conf['low_bit'], conf['cpu_embedding'], batch_size, streaming, use_fp16_torch_dtype, lookahead, task, optimize_model)
+                print("-------------------- Finish running model: {} --------------------".format(model))
         df = pd.DataFrame(results, columns=['model', '1st token avg latency (ms)', '2+ avg latency (ms/token)', 'encoder time (ms)',
                                             'input/output tokens', 'batch_size', 'actual input/output tokens', 'num_beams', 'low_bit', 'cpu_embedding',
                                             'model loading time (s)', 'peak mem (GB)', 'streaming', 'use_fp16_torch_dtype'])
+        print("-------------------- Results: {} --------------------".format(results))
         if "pipeline" in api or "deepspeed" in api:
             if torch.distributed.get_rank() == 0:
                 df.index += max(line_counter - 1, 0)
@@ -2079,3 +2084,4 @@ def run_pipeline_parallel_gpu(repo_id,
                     df.to_csv(csv_name, mode='a', header=None, encoding='utf-8')
             line_counter += len(df.index)
         results = []
+

From 27f75f537c80b2149770e1cd17f667288a624519 Mon Sep 17 00:00:00 2001
From: songhappy <guoqiongsong@gmail.com>
Date: Tue, 27 Aug 2024 19:17:46 -0700
Subject: [PATCH 2/6] main 3k test

---
 .github/workflows/llm_performance_tests.yml | 28 ++++++++++-----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/llm_performance_tests.yml b/.github/workflows/llm_performance_tests.yml
index 37e6243f341..a4b1fd2ee55 100644
--- a/.github/workflows/llm_performance_tests.yml
+++ b/.github/workflows/llm_performance_tests.yml
@@ -992,22 +992,22 @@ jobs:
       #     pip uninstall trl -y
       #     call conda deactivate
 
-      # - name: Concat csv and generate html (2048-256 int4+fp16)
-        shell: cmd
-        run: |
-          call conda activate html-gen
+      # # - name: Concat csv and generate html (2048-256 int4+fp16)
+      #   shell: cmd
+      #   run: |
+      #     call conda activate html-gen
 
-          cd python\llm\dev\benchmark\all-in-one
-          python ..\..\..\test\benchmark\concat_csv.py
-          if %ERRORLEVEL% neq 0 (exit /b 1)
-          del /q *test*.csv
-          move *.csv %CSV_SAVE_PATH%\2048-256_int4_fp16\
-          cd ..\..\..\test\benchmark
-          python csv_to_html.py -f %CSV_SAVE_PATH%\2048-256_int4_fp16\
-          if %ERRORLEVEL% neq 0 (exit /b 1)
-          move %CSV_SAVE_PATH%\2048-256_int4_fp16\*.html %CSV_SAVE_PATH%
+      #     cd python\llm\dev\benchmark\all-in-one
+      #     python ..\..\..\test\benchmark\concat_csv.py
+      #     if %ERRORLEVEL% neq 0 (exit /b 1)
+      #     del /q *test*.csv
+      #     move *.csv %CSV_SAVE_PATH%\2048-256_int4_fp16\
+      #     cd ..\..\..\test\benchmark
+      #     python csv_to_html.py -f %CSV_SAVE_PATH%\2048-256_int4_fp16\
+      #     if %ERRORLEVEL% neq 0 (exit /b 1)
+      #     move %CSV_SAVE_PATH%\2048-256_int4_fp16\*.html %CSV_SAVE_PATH%
 
-          call conda deactivate
+      #     call conda deactivate
 
       # 3072-384 int4+fp16
       - name: Prepare igpu perf test (3072-384 int4+fp16)

From 12865dd98bff58294492db6d6fef30374b36c092 Mon Sep 17 00:00:00 2001
From: songhappy <guoqiongsong@gmail.com>
Date: Tue, 27 Aug 2024 22:52:01 -0700
Subject: [PATCH 3/6] log conf

---
 python/llm/dev/benchmark/all-in-one/run.py                  | 4 ++++
 python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16.yaml | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index 7df128e9b59..72a25691151 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -2038,6 +2038,8 @@ def run_pipeline_parallel_gpu(repo_id,
     for api in conf.test_api:
         global csv_name
         csv_name = f'{current_dir}/{api}-results-{today}.csv'
+        print("-------------------- csv_name: {} --------------------".format(csv_name))
+        print(conf)
         try:
             line_counter = len(open(csv_name).readlines())
         except:
@@ -2066,6 +2068,8 @@ def run_pipeline_parallel_gpu(repo_id,
         df = pd.DataFrame(results, columns=['model', '1st token avg latency (ms)', '2+ avg latency (ms/token)', 'encoder time (ms)',
                                             'input/output tokens', 'batch_size', 'actual input/output tokens', 'num_beams', 'low_bit', 'cpu_embedding',
                                             'model loading time (s)', 'peak mem (GB)', 'streaming', 'use_fp16_torch_dtype'])
+        print("-------------------- Results df:--------------------")
+        print(df)
         print("-------------------- Results: {} --------------------".format(results))
         if "pipeline" in api or "deepspeed" in api:
             if torch.distributed.get_rank() == 0:
diff --git a/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16.yaml b/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16.yaml
index 60202594cba..edca0e7b67d 100644
--- a/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16.yaml
+++ b/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16.yaml
@@ -19,7 +19,7 @@ repo_id:
   - 'openbmb/MiniCPM-V-2_6'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
-num_trials: 3
+num_trials: 1
 num_beams: 1 # default to greedy search
 low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 batch_size: 1 # default to 1

From 6a6549ff5ea26adb5669eac498e5b533df910208 Mon Sep 17 00:00:00 2001
From: songhappy <guoqiongsong@gmail.com>
Date: Tue, 27 Aug 2024 23:32:33 -0700
Subject: [PATCH 4/6] 41 new run

---
 .github/workflows/llm_performance_tests.yml   | 667 +++++++++---------
 python/llm/dev/benchmark/all-in-one/run.py    |   5 +-
 .../src/ipex_llm/utils/benchmark_util_4_29.py |   2 +-
 .../igpu-perf/3072-384_int4_fp16.yaml         |   2 +-
 4 files changed, 339 insertions(+), 337 deletions(-)

diff --git a/.github/workflows/llm_performance_tests.yml b/.github/workflows/llm_performance_tests.yml
index a4b1fd2ee55..220bf2fe0fa 100644
--- a/.github/workflows/llm_performance_tests.yml
+++ b/.github/workflows/llm_performance_tests.yml
@@ -154,7 +154,7 @@ jobs:
           source /opt/intel/oneapi/setvars.sh
           export USE_XETLA=OFF
           export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
-          pip install transformers==4.36.2
+          pip install transformers==4.41.2 trl
           cp python/llm/test/benchmark/arc-perf-transformers-436.yaml python/llm/dev/benchmark/all-in-one/config.yaml
           cd python/llm/dev/benchmark/all-in-one
           mkdir test_batch1
@@ -191,7 +191,7 @@ jobs:
           export USE_XETLA=OFF
           export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
           # upgrade for default transformers version
-          python -m pip install transformers==4.37.0
+          python -m pip install transformers==4.41.2 trl
           # batch_size 1
           cp python/llm/test/benchmark/arc-perf-transformers-437.yaml python/llm/dev/benchmark/all-in-one/config.yaml
           cd python/llm/dev/benchmark/all-in-one
@@ -223,7 +223,7 @@ jobs:
           export USE_XETLA=OFF
           export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
           # upgrade transformers for model Qwen/Qwen1.5-MoE-A2.7B-Chat
-          python -m pip install transformers==4.40.0
+          python -m pip install transformers==4.41.2
           python -m pip install trl
           # batch_size 1
           cp python/llm/test/benchmark/arc-perf-transformers-440.yaml python/llm/dev/benchmark/all-in-one/config.yaml
@@ -557,7 +557,7 @@ jobs:
           pip install --upgrade pip
           pip install --upgrade wheel
           pip install --upgrade omegaconf pandas
-          pip install --upgrade tiktoken einops transformers_stream_generator matplotlib
+          pip install --upgrade tiktoken einops transformers_stream_generator matplotlib trl
 
           cd python\llm
           python setup.py clean --all bdist_wheel --win
@@ -640,374 +640,375 @@ jobs:
                       run_transformer_int4_fp16_gpu_win(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, cpu_embedding, batch_size, streaming)
           ' python/llm/dev/benchmark/all-in-one/run.py
 
-      # # 32-32 int4+fp16
-      # - name: Prepare igpu perf test (32-32 int4+fp16)
-      #   shell: bash
-      #   run: |
-      #     # hide time info
-      #     # sed -i 's/str(end - st)/"xxxxxx"/g' python/llm/dev/benchmark/all-in-one/run.py
-      #     sed -i 's/{api}-results-{today}.csv/32-32-{api}-results-{today}_test1.csv/g' python/llm/dev/benchmark/all-in-one/run.py
-      #     sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_int4_fp16.yaml
+      # 32-32 int4+fp16
+      - name: Prepare igpu perf test (32-32 int4+fp16)
+        shell: bash
+        run: |
+          # hide time info
+          # sed -i 's/str(end - st)/"xxxxxx"/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i 's/{api}-results-{today}.csv/32-32-{api}-results-{today}_test1.csv/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_int4_fp16.yaml
 
-      # - name: Test on igpu (32-32 int4+fp16)
-      #   shell: cmd
-      #   run: |
-      #     call conda activate igpu-perf
-      #     set SYCL_CACHE_PERSISTENT=1
-      #     set BIGDL_LLM_XMX_DISABLED=1
-      #     REM for llava
-      #     set TRANSFORMERS_OFFLINE=1
-
-      #     cd python\llm\dev\benchmark\all-in-one
-      #     move ..\..\..\test\benchmark\igpu-perf\32-32_int4_fp16.yaml config.yaml
-      #     set PYTHONIOENCODING=utf-8
-      #     python run.py >> %CSV_SAVE_PATH%\32-32_int4_fp16\log\%LOG_FILE% 2>&1
-      #     if %ERRORLEVEL% neq 0 (exit /b 1)
-      #     python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test1
-      #     if %ERRORLEVEL% neq 0 (exit /b 1)
-
-      #     call conda deactivate
-
-      # - name: Prepare igpu perf test for transformers 4.36 (32-32 int4+fp16)
-      #   shell: bash
-      #   run: |
-      #     sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
-      #     sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_436.yaml
+      - name: Test on igpu (32-32 int4+fp16)
+        shell: cmd
+        run: |
+          call conda activate igpu-perf
+          pip install transformers==4.41.2 trl
+          set SYCL_CACHE_PERSISTENT=1
+          set BIGDL_LLM_XMX_DISABLED=1
+          REM for llava
+          set TRANSFORMERS_OFFLINE=1
 
-      # - name: Test on igpu for transformers 4.36 (32-32 int4+fp16)
-      #   shell: cmd
-      #   run: |
-      #     call conda activate igpu-perf
-      #     pip install transformers==4.36.2
+          cd python\llm\dev\benchmark\all-in-one
+          move ..\..\..\test\benchmark\igpu-perf\32-32_int4_fp16.yaml config.yaml
+          set PYTHONIOENCODING=utf-8
+          python run.py >> %CSV_SAVE_PATH%\32-32_int4_fp16\log\%LOG_FILE% 2>&1
+          if %ERRORLEVEL% neq 0 (exit /b 1)
+          python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test1
+          if %ERRORLEVEL% neq 0 (exit /b 1)
 
-      #     set SYCL_CACHE_PERSISTENT=1
-      #     set BIGDL_LLM_XMX_DISABLED=1
+          call conda deactivate
+
+      - name: Prepare igpu perf test for transformers 4.36 (32-32 int4+fp16)
+        shell: bash
+        run: |
+          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_436.yaml
+
+      - name: Test on igpu for transformers 4.36 (32-32 int4+fp16)
+        shell: cmd
+        run: |
+          call conda activate igpu-perf
+          pip install transformers==4.41.2 trl
+
+          set SYCL_CACHE_PERSISTENT=1
+          set BIGDL_LLM_XMX_DISABLED=1
 
-      #     cd python\llm\dev\benchmark\all-in-one
-      #     move ..\..\..\test\benchmark\igpu-perf\32-32_int4_fp16_436.yaml config.yaml
-      #     set PYTHONIOENCODING=utf-8
-      #     python run.py >> %CSV_SAVE_PATH%\32-32_int4_fp16\log\%LOG_FILE% 2>&1
-      #     if %ERRORLEVEL% neq 0 (exit /b 1)
-      #     python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test2
-      #     if %ERRORLEVEL% neq 0 (exit /b 1)
+          cd python\llm\dev\benchmark\all-in-one
+          move ..\..\..\test\benchmark\igpu-perf\32-32_int4_fp16_436.yaml config.yaml
+          set PYTHONIOENCODING=utf-8
+          python run.py >> %CSV_SAVE_PATH%\32-32_int4_fp16\log\%LOG_FILE% 2>&1
+          if %ERRORLEVEL% neq 0 (exit /b 1)
+          python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test2
+          if %ERRORLEVEL% neq 0 (exit /b 1)
 
-      #     call conda deactivate
+          call conda deactivate
           
-      # - name: Prepare igpu perf test for transformers 4.38 (32-32 int4+fp16)
-      #   shell: bash
-      #   run: |
-      #     sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
-      #     sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_438.yaml
+      - name: Prepare igpu perf test for transformers 4.38 (32-32 int4+fp16)
+        shell: bash
+        run: |
+          sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_438.yaml
 
-      # - name: Test on igpu for transformers 4.38 (32-32 int4+fp16)
-      #   shell: cmd
-      #   run: |
-      #     call conda activate igpu-perf
-      #     pip install transformers==4.38.2
+      - name: Test on igpu for transformers 4.38 (32-32 int4+fp16)
+        shell: cmd
+        run: |
+          call conda activate igpu-perf
+          pip install transformers==4.41.2 trl
 
-      #     set SYCL_CACHE_PERSISTENT=1
-      #     set BIGDL_LLM_XMX_DISABLED=1
+          set SYCL_CACHE_PERSISTENT=1
+          set BIGDL_LLM_XMX_DISABLED=1
 
-      #     cd python\llm\dev\benchmark\all-in-one
-      #     move ..\..\..\test\benchmark\igpu-perf\32-32_int4_fp16_438.yaml config.yaml
-      #     set PYTHONIOENCODING=utf-8
-      #     python run.py >> %CSV_SAVE_PATH%\32-32_int4_fp16\log\%LOG_FILE% 2>&1
-      #     if %ERRORLEVEL% neq 0 if %ERRORLEVEL% neq -1073740791 (exit /b 1)
-      #     python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test3
-      #     if %ERRORLEVEL% neq 0 (exit /b 1)
+          cd python\llm\dev\benchmark\all-in-one
+          move ..\..\..\test\benchmark\igpu-perf\32-32_int4_fp16_438.yaml config.yaml
+          set PYTHONIOENCODING=utf-8
+          python run.py >> %CSV_SAVE_PATH%\32-32_int4_fp16\log\%LOG_FILE% 2>&1
+          if %ERRORLEVEL% neq 0 if %ERRORLEVEL% neq -1073740791 (exit /b 1)
+          python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test3
+          if %ERRORLEVEL% neq 0 (exit /b 1)
 
-      #     call conda deactivate
+          call conda deactivate
 
-      # - name: Prepare igpu perf test for transformers 4.43 (32-32 int4+fp16)
-      #   shell: bash
-      #   run: |
-      #     sed -i 's/{today}_test3/{today}_test4/g' python/llm/dev/benchmark/all-in-one/run.py
-      #     sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_443.yaml
+      - name: Prepare igpu perf test for transformers 4.43 (32-32 int4+fp16)
+        shell: bash
+        run: |
+          sed -i 's/{today}_test3/{today}_test4/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/32-32_int4_fp16_443.yaml
 
-      # - name: Test on igpu for transformers 4.43 (32-32 int4+fp16)
-      #   shell: cmd
-      #   run: |
-      #     call conda activate igpu-perf
-      #     pip install transformers==4.43.1
-      #     pip install trl
+      - name: Test on igpu for transformers 4.43 (32-32 int4+fp16)
+        shell: cmd
+        run: |
+          call conda activate igpu-perf
+          pip install transformers==4.43.1
+          pip install trl
 
-      #     set SYCL_CACHE_PERSISTENT=1
-      #     set BIGDL_LLM_XMX_DISABLED=1
+          set SYCL_CACHE_PERSISTENT=1
+          set BIGDL_LLM_XMX_DISABLED=1
 
-      #     cd python\llm\dev\benchmark\all-in-one
-      #     move ..\..\..\test\benchmark\igpu-perf\32-32_int4_fp16_443.yaml config.yaml
-      #     set PYTHONIOENCODING=utf-8
-      #     python run.py >> %CSV_SAVE_PATH%\32-32_int4_fp16\log\%LOG_FILE% 2>&1
-      #     if %ERRORLEVEL% neq 0 if %ERRORLEVEL% neq -1073740791 (exit /b 1)
-      #     python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test4
-      #     if %ERRORLEVEL% neq 0 (exit /b 1)
+          cd python\llm\dev\benchmark\all-in-one
+          move ..\..\..\test\benchmark\igpu-perf\32-32_int4_fp16_443.yaml config.yaml
+          set PYTHONIOENCODING=utf-8
+          python run.py >> %CSV_SAVE_PATH%\32-32_int4_fp16\log\%LOG_FILE% 2>&1
+          if %ERRORLEVEL% neq 0 if %ERRORLEVEL% neq -1073740791 (exit /b 1)
+          python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test4
+          if %ERRORLEVEL% neq 0 (exit /b 1)
 
-      #     pip uninstall trl -y
-      #     call conda deactivate
+          pip uninstall trl -y
+          call conda deactivate
 
-      # - name: Concat csv and generate html (32-32 int4+fp16)
-      #   shell: cmd
-      #   run: |
-      #     call conda activate html-gen
-
-      #     cd python\llm\dev\benchmark\all-in-one
-      #     python ..\..\..\test\benchmark\concat_csv.py
-      #     if %ERRORLEVEL% neq 0 (exit /b 1)
-      #     del /q *test*.csv
-      #     move *.csv %CSV_SAVE_PATH%\32-32_int4_fp16\
-      #     cd ..\..\..\test\benchmark
-      #     python csv_to_html.py -f %CSV_SAVE_PATH%\32-32_int4_fp16\
-      #     if %ERRORLEVEL% neq 0 (exit /b 1)
-      #     move %CSV_SAVE_PATH%\32-32_int4_fp16\*.html %CSV_SAVE_PATH%
-
-      #     call conda deactivate
-
-      # # TODO: create a action function here for different input
-      # # 1024-128 int4+fp16
-      # - name: Prepare igpu perf test (1024-128 int4+fp16)
-      #   shell: bash
-      #   run: |
-      #     sed -i 's/32-32/1024-128/g' python/llm/dev/benchmark/all-in-one/run.py
-      #     sed -i 's/{today}_test4/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
-      #     sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
+      - name: Concat csv and generate html (32-32 int4+fp16)
+        shell: cmd
+        run: |
+          call conda activate html-gen
 
-      # - name: Test on igpu (1024-128 int4+fp16)
-      #   shell: cmd
-      #   run: |
-      #     call conda activate igpu-perf
-      #     pip install transformers==4.37.0
-
-      #     set SYCL_CACHE_PERSISTENT=1
-      #     set BIGDL_LLM_XMX_DISABLED=1
-      #     REM for llava
-      #     set TRANSFORMERS_OFFLINE=1
-
-      #     cd python\llm\dev\benchmark\all-in-one
-      #     move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16.yaml config.yaml
-      #     set PYTHONIOENCODING=utf-8
-      #     python run.py >> %CSV_SAVE_PATH%\1024-128_int4_fp16\log\%LOG_FILE% 2>&1
-      #     if %ERRORLEVEL% neq 0 (exit /b 1)
-      #     python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test1
-      #     if %ERRORLEVEL% neq 0 (exit /b 1)
-
-      #     call conda deactivate
-
-      # - name: Prepare igpu perf test for transformers 4.36 (1024-128 int4+fp16)
-      #   shell: bash
-      #   run: |
-      #     sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
-      #     sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_436.yaml
+          cd python\llm\dev\benchmark\all-in-one
+          python ..\..\..\test\benchmark\concat_csv.py
+          if %ERRORLEVEL% neq 0 (exit /b 1)
+          del /q *test*.csv
+          move *.csv %CSV_SAVE_PATH%\32-32_int4_fp16\
+          cd ..\..\..\test\benchmark
+          python csv_to_html.py -f %CSV_SAVE_PATH%\32-32_int4_fp16\
+          if %ERRORLEVEL% neq 0 (exit /b 1)
+          move %CSV_SAVE_PATH%\32-32_int4_fp16\*.html %CSV_SAVE_PATH%
 
-      # - name: Test on igpu for transformers 4.36 (1024-128 int4+fp16)
-      #   shell: cmd
-      #   run: |
-      #     call conda activate igpu-perf
-      #     pip install transformers==4.36.2
+          call conda deactivate
 
-      #     set SYCL_CACHE_PERSISTENT=1
-      #     set BIGDL_LLM_XMX_DISABLED=1
+      # TODO: create a action function here for different input
+      # 1024-128 int4+fp16
+      - name: Prepare igpu perf test (1024-128 int4+fp16)
+        shell: bash
+        run: |
+          sed -i 's/32-32/1024-128/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i 's/{today}_test4/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16.yaml
 
-      #     cd python\llm\dev\benchmark\all-in-one
-      #     move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16_436.yaml config.yaml
-      #     set PYTHONIOENCODING=utf-8
-      #     python run.py >> %CSV_SAVE_PATH%\1024-128_int4_fp16\log\%LOG_FILE% 2>&1
-      #     if %ERRORLEVEL% neq 0 (exit /b 1)
-      #     python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test2
-      #     if %ERRORLEVEL% neq 0 (exit /b 1)
+      - name: Test on igpu (1024-128 int4+fp16)
+        shell: cmd
+        run: |
+          call conda activate igpu-perf
+          pip install transformers==4.41.2 trl
 
-      #     call conda deactivate
+          set SYCL_CACHE_PERSISTENT=1
+          set BIGDL_LLM_XMX_DISABLED=1
+          REM for llava
+          set TRANSFORMERS_OFFLINE=1
 
-      # - name: Prepare igpu perf test for transformers 4.38 (1024-128 int4+fp16)
-      #   shell: bash
-      #   run: |
-      #     sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
-      #     sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_438.yaml
+          cd python\llm\dev\benchmark\all-in-one
+          move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16.yaml config.yaml
+          set PYTHONIOENCODING=utf-8
+          python run.py >> %CSV_SAVE_PATH%\1024-128_int4_fp16\log\%LOG_FILE% 2>&1
+          if %ERRORLEVEL% neq 0 (exit /b 1)
+          python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test1
+          if %ERRORLEVEL% neq 0 (exit /b 1)
 
-      # - name: Test on igpu for transformers 4.38 (1024-128 int4+fp16)
-      #   shell: cmd
-      #   run: |
-      #     call conda activate igpu-perf
-      #     pip install transformers==4.38.2
+          call conda deactivate
 
-      #     set SYCL_CACHE_PERSISTENT=1
-      #     set BIGDL_LLM_XMX_DISABLED=1
+      - name: Prepare igpu perf test for transformers 4.36 (1024-128 int4+fp16)
+        shell: bash
+        run: |
+          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_436.yaml
 
-      #     cd python\llm\dev\benchmark\all-in-one
-      #     move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16_438.yaml config.yaml
-      #     set PYTHONIOENCODING=utf-8
-      #     python run.py >> %CSV_SAVE_PATH%\1024-128_int4_fp16\log\%LOG_FILE% 2>&1
-      #     if %ERRORLEVEL% neq 0 (exit /b 1)
-      #     python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test3
-      #     if %ERRORLEVEL% neq 0 (exit /b 1)
+      - name: Test on igpu for transformers 4.36 (1024-128 int4+fp16)
+        shell: cmd
+        run: |
+          call conda activate igpu-perf
+          pip install transformers==4.41.2 trl
 
-      #     call conda deactivate
+          set SYCL_CACHE_PERSISTENT=1
+          set BIGDL_LLM_XMX_DISABLED=1
 
-      # - name: Prepare igpu perf test for transformers 4.43 (1024-128 int4+fp16)
-      #   shell: bash
-      #   run: |
-      #     sed -i 's/{today}_test3/{today}_test4/g' python/llm/dev/benchmark/all-in-one/run.py
-      #     sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_443.yaml
+          cd python\llm\dev\benchmark\all-in-one
+          move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16_436.yaml config.yaml
+          set PYTHONIOENCODING=utf-8
+          python run.py >> %CSV_SAVE_PATH%\1024-128_int4_fp16\log\%LOG_FILE% 2>&1
+          if %ERRORLEVEL% neq 0 (exit /b 1)
+          python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test2
+          if %ERRORLEVEL% neq 0 (exit /b 1)
 
-      # - name: Test on igpu for transformers 4.43 (1024-128 int4+fp16)
-      #   shell: cmd
-      #   run: |
-      #     call conda activate igpu-perf
-      #     pip install transformers==4.43.1
-      #     pip install trl
+          call conda deactivate
 
-      #     set SYCL_CACHE_PERSISTENT=1
-      #     set BIGDL_LLM_XMX_DISABLED=1
+      - name: Prepare igpu perf test for transformers 4.38 (1024-128 int4+fp16)
+        shell: bash
+        run: |
+          sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_438.yaml
 
-      #     cd python\llm\dev\benchmark\all-in-one
-      #     move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16_443.yaml config.yaml
-      #     set PYTHONIOENCODING=utf-8
-      #     python run.py >> %CSV_SAVE_PATH%\1024-128_int4_fp16\log\%LOG_FILE% 2>&1
-      #     if %ERRORLEVEL% neq 0 (exit /b 1)
-      #     python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test4
-      #     if %ERRORLEVEL% neq 0 (exit /b 1)
+      - name: Test on igpu for transformers 4.38 (1024-128 int4+fp16)
+        shell: cmd
+        run: |
+          call conda activate igpu-perf
+          pip install transformers==4.41.2 trl
 
-      #     pip uninstall trl -y
-      #     call conda deactivate
+          set SYCL_CACHE_PERSISTENT=1
+          set BIGDL_LLM_XMX_DISABLED=1
 
-      # - name: Concat csv and generate html (1024-128 int4+fp16)
-      #   shell: cmd
-      #   run: |
-      #     call conda activate html-gen
-
-      #     cd python\llm\dev\benchmark\all-in-one
-      #     python ..\..\..\test\benchmark\concat_csv.py
-      #     if %ERRORLEVEL% neq 0 (exit /b 1)
-      #     del /q *test*.csv
-      #     move *.csv %CSV_SAVE_PATH%\1024-128_int4_fp16\
-      #     cd ..\..\..\test\benchmark
-      #     python csv_to_html.py -f %CSV_SAVE_PATH%\1024-128_int4_fp16\
-      #     if %ERRORLEVEL% neq 0 (exit /b 1)
-      #     move %CSV_SAVE_PATH%\1024-128_int4_fp16\*.html %CSV_SAVE_PATH%
-
-      #     call conda deactivate
-
-      # # 2048-256 int4+fp16
-      # - name: Prepare igpu perf test (2048-256 int4+fp16)
-      #   shell: bash
-      #   run: |
-      #     sed -i 's/1024-128/2048-256/g' python/llm/dev/benchmark/all-in-one/run.py
-      #     sed -i 's/{today}_test4/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
-      #     sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16.yaml
+          cd python\llm\dev\benchmark\all-in-one
+          move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16_438.yaml config.yaml
+          set PYTHONIOENCODING=utf-8
+          python run.py >> %CSV_SAVE_PATH%\1024-128_int4_fp16\log\%LOG_FILE% 2>&1
+          if %ERRORLEVEL% neq 0 (exit /b 1)
+          python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test3
+          if %ERRORLEVEL% neq 0 (exit /b 1)
 
-      # - name: Test on igpu (2048-256 int4+fp16)
-      #   shell: cmd
-      #   run: |
-      #     call conda activate igpu-perf
-      #     pip install transformers==4.37.0
-
-      #     set SYCL_CACHE_PERSISTENT=1
-      #     set BIGDL_LLM_XMX_DISABLED=1
-      #     REM for llava
-      #     set TRANSFORMERS_OFFLINE=1
-
-      #     cd python\llm\dev\benchmark\all-in-one
-      #     move ..\..\..\test\benchmark\igpu-perf\2048-256_int4_fp16.yaml config.yaml
-      #     set PYTHONIOENCODING=utf-8
-      #     python run.py >> %CSV_SAVE_PATH%\2048-256_int4_fp16\log\%LOG_FILE% 2>&1
-      #     if %ERRORLEVEL% neq 0 (exit /b 1)
-      #     python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test1
-      #     if %ERRORLEVEL% neq 0 (exit /b 1)
-
-      #     call conda deactivate
-
-      # - name: Prepare igpu perf test for transformers 4.36 (2048-256 int4+fp16)
-      #   shell: bash
-      #   run: |
-      #     sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
-      #     sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_436.yaml
+          call conda deactivate
 
-      # - name: Test on igpu for transformers 4.36 (2048-256 int4+fp16)
-      #   shell: cmd
-      #   run: |
-      #     call conda activate igpu-perf
-      #     pip install transformers==4.36.2
+      - name: Prepare igpu perf test for transformers 4.43 (1024-128 int4+fp16)
+        shell: bash
+        run: |
+          sed -i 's/{today}_test3/{today}_test4/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/1024-128_int4_fp16_443.yaml
 
-      #     set SYCL_CACHE_PERSISTENT=1
-      #     set BIGDL_LLM_XMX_DISABLED=1
+      - name: Test on igpu for transformers 4.43 (1024-128 int4+fp16)
+        shell: cmd
+        run: |
+          call conda activate igpu-perf
+          pip install transformers==4.43.1
+          pip install trl
 
-      #     cd python\llm\dev\benchmark\all-in-one
-      #     move ..\..\..\test\benchmark\igpu-perf\2048-256_int4_fp16_436.yaml config.yaml
-      #     set PYTHONIOENCODING=utf-8
-      #     python run.py >> %CSV_SAVE_PATH%\2048-256_int4_fp16\log\%LOG_FILE% 2>&1
-      #     if %ERRORLEVEL% neq 0 (exit /b 1)
-      #     python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test2
-      #     if %ERRORLEVEL% neq 0 (exit /b 1)
+          set SYCL_CACHE_PERSISTENT=1
+          set BIGDL_LLM_XMX_DISABLED=1
 
-      #     call conda deactivate
+          cd python\llm\dev\benchmark\all-in-one
+          move ..\..\..\test\benchmark\igpu-perf\1024-128_int4_fp16_443.yaml config.yaml
+          set PYTHONIOENCODING=utf-8
+          python run.py >> %CSV_SAVE_PATH%\1024-128_int4_fp16\log\%LOG_FILE% 2>&1
+          if %ERRORLEVEL% neq 0 (exit /b 1)
+          python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test4
+          if %ERRORLEVEL% neq 0 (exit /b 1)
 
-      # - name: Prepare igpu perf test for transformers 4.38 (2048-256 int4+fp16)
-      #   shell: bash
-      #   run: |
-      #     sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
-      #     sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_438.yaml
+          pip uninstall trl -y
+          call conda deactivate
 
-      # - name: Test on igpu for transformers 4.38 (2048-256 int4+fp16)
-      #   shell: cmd
-      #   run: |
-      #     call conda activate igpu-perf
-      #     pip install transformers==4.38.2
+      - name: Concat csv and generate html (1024-128 int4+fp16)
+        shell: cmd
+        run: |
+          call conda activate html-gen
 
-      #     set SYCL_CACHE_PERSISTENT=1
-      #     set BIGDL_LLM_XMX_DISABLED=1
+          cd python\llm\dev\benchmark\all-in-one
+          python ..\..\..\test\benchmark\concat_csv.py
+          if %ERRORLEVEL% neq 0 (exit /b 1)
+          del /q *test*.csv
+          move *.csv %CSV_SAVE_PATH%\1024-128_int4_fp16\
+          cd ..\..\..\test\benchmark
+          python csv_to_html.py -f %CSV_SAVE_PATH%\1024-128_int4_fp16\
+          if %ERRORLEVEL% neq 0 (exit /b 1)
+          move %CSV_SAVE_PATH%\1024-128_int4_fp16\*.html %CSV_SAVE_PATH%
 
-      #     cd python\llm\dev\benchmark\all-in-one
-      #     move ..\..\..\test\benchmark\igpu-perf\2048-256_int4_fp16_438.yaml config.yaml
-      #     set PYTHONIOENCODING=utf-8
-      #     python run.py >> %CSV_SAVE_PATH%\2048-256_int4_fp16\log\%LOG_FILE% 2>&1
-      #     if %ERRORLEVEL% neq 0 (exit /b 1)
-      #     python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test3
-      #     if %ERRORLEVEL% neq 0 (exit /b 1)
+          call conda deactivate
 
-      #     call conda deactivate
+      # 2048-256 int4+fp16
+      - name: Prepare igpu perf test (2048-256 int4+fp16)
+        shell: bash
+        run: |
+          sed -i 's/1024-128/2048-256/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i 's/{today}_test4/{today}_test1/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16.yaml
 
-      # - name: Prepare igpu perf test for transformers 4.43 (2048-256 int4+fp16)
-      #   shell: bash
-      #   run: |
-      #     sed -i 's/{today}_test3/{today}_test4/g' python/llm/dev/benchmark/all-in-one/run.py
-      #     sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_443.yaml
+      - name: Test on igpu (2048-256 int4+fp16)
+        shell: cmd
+        run: |
+          call conda activate igpu-perf
+          pip install transformers==4.41.2 trl
 
-      # - name: Test on igpu for transformers 4.43 (2048-256 int4+fp16)
-      #   shell: cmd
-      #   run: |
-      #     call conda activate igpu-perf
-      #     pip install transformers==4.43.1
-      #     pip install trl
+          set SYCL_CACHE_PERSISTENT=1
+          set BIGDL_LLM_XMX_DISABLED=1
+          REM for llava
+          set TRANSFORMERS_OFFLINE=1
 
-      #     set SYCL_CACHE_PERSISTENT=1
-      #     set BIGDL_LLM_XMX_DISABLED=1
+          cd python\llm\dev\benchmark\all-in-one
+          move ..\..\..\test\benchmark\igpu-perf\2048-256_int4_fp16.yaml config.yaml
+          set PYTHONIOENCODING=utf-8
+          python run.py >> %CSV_SAVE_PATH%\2048-256_int4_fp16\log\%LOG_FILE% 2>&1
+          if %ERRORLEVEL% neq 0 (exit /b 1)
+          python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test1
+          if %ERRORLEVEL% neq 0 (exit /b 1)
 
-      #     cd python\llm\dev\benchmark\all-in-one
-      #     move ..\..\..\test\benchmark\igpu-perf\2048-256_int4_fp16_443.yaml config.yaml
-      #     set PYTHONIOENCODING=utf-8
-      #     python run.py >> %CSV_SAVE_PATH%\2048-256_int4_fp16\log\%LOG_FILE% 2>&1
-      #     if %ERRORLEVEL% neq 0 (exit /b 1)
-      #     python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test4
-      #     if %ERRORLEVEL% neq 0 (exit /b 1)
+          call conda deactivate
 
-      #     pip uninstall trl -y
-      #     call conda deactivate
+      - name: Prepare igpu perf test for transformers 4.36 (2048-256 int4+fp16)
+        shell: bash
+        run: |
+          sed -i 's/{today}_test1/{today}_test2/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_436.yaml
 
-      # # - name: Concat csv and generate html (2048-256 int4+fp16)
-      #   shell: cmd
-      #   run: |
-      #     call conda activate html-gen
+      - name: Test on igpu for transformers 4.36 (2048-256 int4+fp16)
+        shell: cmd
+        run: |
+          call conda activate igpu-perf
+          pip install transformers==4.41.2 trl
+
+          set SYCL_CACHE_PERSISTENT=1
+          set BIGDL_LLM_XMX_DISABLED=1
+
+          cd python\llm\dev\benchmark\all-in-one
+          move ..\..\..\test\benchmark\igpu-perf\2048-256_int4_fp16_436.yaml config.yaml
+          set PYTHONIOENCODING=utf-8
+          python run.py >> %CSV_SAVE_PATH%\2048-256_int4_fp16\log\%LOG_FILE% 2>&1
+          if %ERRORLEVEL% neq 0 (exit /b 1)
+          python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test2
+          if %ERRORLEVEL% neq 0 (exit /b 1)
+
+          call conda deactivate
+
+      - name: Prepare igpu perf test for transformers 4.38 (2048-256 int4+fp16)
+        shell: bash
+        run: |
+          sed -i 's/{today}_test2/{today}_test3/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_438.yaml
 
-      #     cd python\llm\dev\benchmark\all-in-one
-      #     python ..\..\..\test\benchmark\concat_csv.py
-      #     if %ERRORLEVEL% neq 0 (exit /b 1)
-      #     del /q *test*.csv
-      #     move *.csv %CSV_SAVE_PATH%\2048-256_int4_fp16\
-      #     cd ..\..\..\test\benchmark
-      #     python csv_to_html.py -f %CSV_SAVE_PATH%\2048-256_int4_fp16\
-      #     if %ERRORLEVEL% neq 0 (exit /b 1)
-      #     move %CSV_SAVE_PATH%\2048-256_int4_fp16\*.html %CSV_SAVE_PATH%
+      - name: Test on igpu for transformers 4.38 (2048-256 int4+fp16)
+        shell: cmd
+        run: |
+          call conda activate igpu-perf
+          pip install transformers==4.41.2 trl
 
-      #     call conda deactivate
+          set SYCL_CACHE_PERSISTENT=1
+          set BIGDL_LLM_XMX_DISABLED=1
+
+          cd python\llm\dev\benchmark\all-in-one
+          move ..\..\..\test\benchmark\igpu-perf\2048-256_int4_fp16_438.yaml config.yaml
+          set PYTHONIOENCODING=utf-8
+          python run.py >> %CSV_SAVE_PATH%\2048-256_int4_fp16\log\%LOG_FILE% 2>&1
+          if %ERRORLEVEL% neq 0 (exit /b 1)
+          python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test3
+          if %ERRORLEVEL% neq 0 (exit /b 1)
+
+          call conda deactivate
+
+      - name: Prepare igpu perf test for transformers 4.43 (2048-256 int4+fp16)
+        shell: bash
+        run: |
+          sed -i 's/{today}_test3/{today}_test4/g' python/llm/dev/benchmark/all-in-one/run.py
+          sed -i "s/path to your local model hub/$MODEL_HUB_DIR/g" python/llm/test/benchmark/igpu-perf/2048-256_int4_fp16_443.yaml
+
+      - name: Test on igpu for transformers 4.43 (2048-256 int4+fp16)
+        shell: cmd
+        run: |
+          call conda activate igpu-perf
+          pip install transformers==4.43.1
+          pip install trl
+
+          set SYCL_CACHE_PERSISTENT=1
+          set BIGDL_LLM_XMX_DISABLED=1
+
+          cd python\llm\dev\benchmark\all-in-one
+          move ..\..\..\test\benchmark\igpu-perf\2048-256_int4_fp16_443.yaml config.yaml
+          set PYTHONIOENCODING=utf-8
+          python run.py >> %CSV_SAVE_PATH%\2048-256_int4_fp16\log\%LOG_FILE% 2>&1
+          if %ERRORLEVEL% neq 0 (exit /b 1)
+          python ..\..\..\test\benchmark\igpu-perf\check_csv_results.py --yaml-file config.yaml --suffix test4
+          if %ERRORLEVEL% neq 0 (exit /b 1)
+
+          pip uninstall trl -y
+          call conda deactivate
+
+      - name: Concat csv and generate html (2048-256 int4+fp16)
+        shell: cmd
+        run: |
+          call conda activate html-gen
+
+          cd python\llm\dev\benchmark\all-in-one
+          python ..\..\..\test\benchmark\concat_csv.py
+          if %ERRORLEVEL% neq 0 (exit /b 1)
+          del /q *test*.csv
+          move *.csv %CSV_SAVE_PATH%\2048-256_int4_fp16\
+          cd ..\..\..\test\benchmark
+          python csv_to_html.py -f %CSV_SAVE_PATH%\2048-256_int4_fp16\
+          if %ERRORLEVEL% neq 0 (exit /b 1)
+          move %CSV_SAVE_PATH%\2048-256_int4_fp16\*.html %CSV_SAVE_PATH%
+
+          call conda deactivate
 
       # 3072-384 int4+fp16
       - name: Prepare igpu perf test (3072-384 int4+fp16)
@@ -1021,7 +1022,7 @@ jobs:
         shell: cmd
         run: |
           call conda activate igpu-perf
-          pip install transformers==4.37.0
+          pip install transformers==4.41.2 trl
 
           set SYCL_CACHE_PERSISTENT=1
           set BIGDL_LLM_XMX_DISABLED=1
@@ -1048,7 +1049,7 @@ jobs:
         shell: cmd
         run: |
           call conda activate igpu-perf
-          pip install transformers==4.36.2
+          pip install transformers==4.41.2 trl
 
           set SYCL_CACHE_PERSISTENT=1
           set BIGDL_LLM_XMX_DISABLED=1
@@ -1073,7 +1074,7 @@ jobs:
         shell: cmd
         run: |
           call conda activate igpu-perf
-          pip install transformers==4.38.2
+          pip install transformers==4.41.2 trl
 
           set SYCL_CACHE_PERSISTENT=1
           set BIGDL_LLM_XMX_DISABLED=1
@@ -1144,7 +1145,7 @@ jobs:
         shell: cmd
         run: |
           call conda activate igpu-perf
-          pip install transformers==4.37.0
+          pip install transformers==4.41.2 trl
 
           set SYCL_CACHE_PERSISTENT=1
           set BIGDL_LLM_XMX_DISABLED=1
@@ -1171,7 +1172,7 @@ jobs:
         shell: cmd
         run: |
           call conda activate igpu-perf
-          pip install transformers==4.38.2
+          pip install transformers==4.41.2 trl
 
           set SYCL_CACHE_PERSISTENT=1
           set BIGDL_LLM_XMX_DISABLED=1
@@ -1242,7 +1243,7 @@ jobs:
         shell: cmd
         run: |
           call conda activate igpu-perf
-          pip install transformers==4.37.0
+          pip install transformers==4.41.2 trl
 
           set SYCL_CACHE_PERSISTENT=1
           set BIGDL_LLM_XMX_DISABLED=1
@@ -1269,7 +1270,7 @@ jobs:
         shell: cmd
         run: |
           call conda activate igpu-perf
-          pip install transformers==4.36.2
+          pip install transformers==4.41.2 trl
 
           set SYCL_CACHE_PERSISTENT=1
           set BIGDL_LLM_XMX_DISABLED=1
@@ -1294,7 +1295,7 @@ jobs:
         shell: cmd
         run: |
           call conda activate igpu-perf
-          pip install transformers==4.38.2
+          pip install transformers==4.41.2 trl
 
           set SYCL_CACHE_PERSISTENT=1
           set BIGDL_LLM_XMX_DISABLED=1
@@ -1364,7 +1365,7 @@ jobs:
         shell: cmd
         run: |
           call conda activate igpu-perf
-          pip install transformers==4.37.0
+          pip install transformers==4.41.2 trl
 
           set SYCL_CACHE_PERSISTENT=1
           set BIGDL_LLM_XMX_DISABLED=1
@@ -1391,7 +1392,7 @@ jobs:
         shell: cmd
         run: |
           call conda activate igpu-perf
-          pip install transformers==4.36.2
+          pip install transformers==4.41.2 trl
 
           set SYCL_CACHE_PERSISTENT=1
           set BIGDL_LLM_XMX_DISABLED=1
@@ -1416,7 +1417,7 @@ jobs:
         shell: cmd
         run: |
           call conda activate igpu-perf
-          pip install transformers==4.38.2
+          pip install transformers==4.41.2 trl
 
           set SYCL_CACHE_PERSISTENT=1
           set BIGDL_LLM_XMX_DISABLED=1
diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py
index 72a25691151..0919d1349c7 100644
--- a/python/llm/dev/benchmark/all-in-one/run.py
+++ b/python/llm/dev/benchmark/all-in-one/run.py
@@ -2038,8 +2038,7 @@ def run_pipeline_parallel_gpu(repo_id,
     for api in conf.test_api:
         global csv_name
         csv_name = f'{current_dir}/{api}-results-{today}.csv'
-        print("-------------------- csv_name: {} --------------------".format(csv_name))
-        print(conf)
+
         try:
             line_counter = len(open(csv_name).readlines())
         except:
@@ -2071,6 +2070,8 @@ def run_pipeline_parallel_gpu(repo_id,
         print("-------------------- Results df:--------------------")
         print(df)
         print("-------------------- Results: {} --------------------".format(results))
+        print("-------------------- csv_name: {} --------------------".format(csv_name))
+        print(conf)
         if "pipeline" in api or "deepspeed" in api:
             if torch.distributed.get_rank() == 0:
                 df.index += max(line_counter - 1, 0)
diff --git a/python/llm/src/ipex_llm/utils/benchmark_util_4_29.py b/python/llm/src/ipex_llm/utils/benchmark_util_4_29.py
index d64631f1f4c..8e74b4507c5 100644
--- a/python/llm/src/ipex_llm/utils/benchmark_util_4_29.py
+++ b/python/llm/src/ipex_llm/utils/benchmark_util_4_29.py
@@ -2452,7 +2452,8 @@ def greedy_search(
                 last_token_time.append(end - st)
 
             # stop if we exceed the maximum length
-            if stopping_criteria(input_ids, scores):
+            # criteria may return a plain bool (older transformers) or a per-sequence BoolTensor
+            if torch.all(torch.as_tensor(stopping_criteria(input_ids, scores))):
                 this_peer_finished = True
 
             if this_peer_finished and not synced_gpus:
diff --git a/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16.yaml b/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16.yaml
index edca0e7b67d..60202594cba 100644
--- a/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16.yaml
+++ b/python/llm/test/benchmark/igpu-perf/3072-384_int4_fp16.yaml
@@ -19,7 +19,7 @@ repo_id:
   - 'openbmb/MiniCPM-V-2_6'
 local_model_hub: 'path to your local model hub'
 warm_up: 1
-num_trials: 1
+num_trials: 3
 num_beams: 1 # default to greedy search
 low_bit: 'sym_int4' # default to use 'sym_int4' (i.e. symmetric int4)
 batch_size: 1 # default to 1

From f3256609dd2264fd74713ff62d94aecc36f78750 Mon Sep 17 00:00:00 2001
From: songhappy <guoqiongsong@gmail.com>
Date: Fri, 30 Aug 2024 13:22:41 -0700
Subject: [PATCH 5/6] for cpu

---
 .github/workflows/llm_performance_tests.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/llm_performance_tests.yml b/.github/workflows/llm_performance_tests.yml
index 220bf2fe0fa..44c3fb2ec7e 100644
--- a/.github/workflows/llm_performance_tests.yml
+++ b/.github/workflows/llm_performance_tests.yml
@@ -419,6 +419,8 @@ jobs:
           export https_proxy=${HTTPS_PROXY}
           source ipex-llm-init -t
           export OMP_NUM_THREADS=48
+          # upgrade for default transformers version
+          python -m pip install transformers==4.41.2 trl
           # hide time info
           sed -i 's/str(end - st)/"xxxxxx"/g' run.py
           python run.py
@@ -499,6 +501,8 @@ jobs:
           cd python/llm/dev/benchmark/all-in-one
           export http_proxy=${HTTP_PROXY}
           export https_proxy=${HTTPS_PROXY}
+          # upgrade for default transformers version
+          python -m pip install transformers==4.41.2 trl
           # hide time info
           sed -i 's/str(end - st)/"xxxxxx"/g' run.py
           python run.py

From 428e62b44a7d0bd7bcb170e5e2c1dc884ea39f2c Mon Sep 17 00:00:00 2001
From: songhappy <guoqiongsong@gmail.com>
Date: Wed, 4 Sep 2024 16:55:37 -0700
Subject: [PATCH 6/6] update

---
 python/llm/src/ipex_llm/transformers/models/llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/llm/src/ipex_llm/transformers/models/llama.py b/python/llm/src/ipex_llm/transformers/models/llama.py
index dfbbaf003a6..873407fbddd 100644
--- a/python/llm/src/ipex_llm/transformers/models/llama.py
+++ b/python/llm/src/ipex_llm/transformers/models/llama.py
@@ -1579,7 +1579,7 @@ def llama_attention_forward_4_41_original(
                     past_key_value.key_cache[self.layer_idx] = key_states
                     past_key_value.value_cache[self.layer_idx] = value_states
 
-    if cache_position is not None:
+    if attention_mask is not None:
         new_attention_mask = attention_mask[:, :, :, 0:kv_seq_len]
     else:
         new_attention_mask = attention_mask