Test For Transformers 4.41 Upgrade #11684

Draft: wants to merge 8 commits into base: main
51 changes: 28 additions & 23 deletions .github/workflows/llm_performance_tests.yml
@@ -154,7 +154,7 @@ jobs:
source /opt/intel/oneapi/setvars.sh
export USE_XETLA=OFF
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
pip install transformers==4.36.2
pip install transformers==4.41.2 trl
cp python/llm/test/benchmark/arc-perf-transformers-436.yaml python/llm/dev/benchmark/all-in-one/config.yaml
cd python/llm/dev/benchmark/all-in-one
mkdir test_batch1
@@ -191,7 +191,7 @@ jobs:
export USE_XETLA=OFF
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
# upgrade for default transformers version
python -m pip install transformers==4.37.0
python -m pip install transformers==4.41.2 trl
# batch_size 1
cp python/llm/test/benchmark/arc-perf-transformers-437.yaml python/llm/dev/benchmark/all-in-one/config.yaml
cd python/llm/dev/benchmark/all-in-one
@@ -223,7 +223,7 @@ jobs:
export USE_XETLA=OFF
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
# upgrade transformers for model Qwen/Qwen1.5-MoE-A2.7B-Chat
python -m pip install transformers==4.40.0
python -m pip install transformers==4.41.2
python -m pip install trl
# batch_size 1
cp python/llm/test/benchmark/arc-perf-transformers-440.yaml python/llm/dev/benchmark/all-in-one/config.yaml
@@ -419,6 +419,8 @@ jobs:
export https_proxy=${HTTPS_PROXY}
source ipex-llm-init -t
export OMP_NUM_THREADS=48
# upgrade for default transformers version
python -m pip install transformers==4.41.2 trl
# hide time info
sed -i 's/str(end - st)/"xxxxxx"/g' run.py
python run.py
@@ -499,6 +501,8 @@ jobs:
cd python/llm/dev/benchmark/all-in-one
export http_proxy=${HTTP_PROXY}
export https_proxy=${HTTPS_PROXY}
# upgrade for default transformers version
python -m pip install transformers==4.41.2 trl
# hide time info
sed -i 's/str(end - st)/"xxxxxx"/g' run.py
python run.py
@@ -557,7 +561,7 @@ jobs:
pip install --upgrade pip
pip install --upgrade wheel
pip install --upgrade omegaconf pandas
pip install --upgrade tiktoken einops transformers_stream_generator matplotlib
pip install --upgrade tiktoken einops transformers_stream_generator matplotlib trl

cd python\llm
python setup.py clean --all bdist_wheel --win
@@ -653,6 +657,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.41.2 trl
set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
REM for llava
@@ -678,7 +683,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.36.2
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -703,7 +708,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.38.2
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -775,7 +780,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.37.0
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -802,7 +807,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.36.2
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -827,7 +832,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.38.2
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -898,7 +903,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.37.0
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -925,7 +930,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.36.2
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -950,7 +955,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.38.2
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -1021,7 +1026,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.37.0
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -1048,7 +1053,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.36.2
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -1073,7 +1078,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.38.2
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -1144,7 +1149,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.37.0
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -1171,7 +1176,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.38.2
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -1242,7 +1247,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.37.0
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -1269,7 +1274,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.36.2
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -1294,7 +1299,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.38.2
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -1364,7 +1369,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.37.0
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -1391,7 +1396,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.36.2
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -1416,7 +1421,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.38.2
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
11 changes: 11 additions & 0 deletions python/llm/dev/benchmark/all-in-one/run.py
@@ -2038,6 +2038,7 @@ def run_pipeline_parallel_gpu(repo_id,
for api in conf.test_api:
global csv_name
csv_name = f'{current_dir}/{api}-results-{today}.csv'

try:
line_counter = len(open(csv_name).readlines())
except:
@@ -2049,6 +2050,9 @@ def run_pipeline_parallel_gpu(repo_id,
for batch_size in batch_list:
for model in conf.repo_id:
in_out_pairs = conf['in_out_pairs'].copy()
print("-------------------- Start running batch_size: {} --------------------".format(batch_size))
print("-------------------- Start running model: {} --------------------".format(model))
print("--------------------in_out_pairs: {}--------------------".format(in_out_pairs))
if excludes:
for in_out in conf['in_out_pairs']:
model_id_input = model + ':' + in_out.split('-')[0]
@@ -2059,9 +2063,15 @@ def run_pipeline_parallel_gpu(repo_id,
lookahead = True
run_model(model, api, in_out_pairs, conf['local_model_hub'], conf['warm_up'], conf['num_trials'], conf['num_beams'],
conf['low_bit'], conf['cpu_embedding'], batch_size, streaming, use_fp16_torch_dtype, lookahead, task, optimize_model)
print("-------------------- Finish running model: {} --------------------".format(model))
df = pd.DataFrame(results, columns=['model', '1st token avg latency (ms)', '2+ avg latency (ms/token)', 'encoder time (ms)',
'input/output tokens', 'batch_size', 'actual input/output tokens', 'num_beams', 'low_bit', 'cpu_embedding',
'model loading time (s)', 'peak mem (GB)', 'streaming', 'use_fp16_torch_dtype'])
print("-------------------- Results df:--------------------")
print(df)
print("-------------------- Results: {} --------------------".format(results))
print("-------------------- csv_name: {} --------------------".format(csv_name))
print(conf)
if "pipeline" in api or "deepspeed" in api:
if torch.distributed.get_rank() == 0:
df.index += max(line_counter - 1, 0)
@@ -2079,3 +2089,4 @@ def run_pipeline_parallel_gpu(repo_id,
df.to_csv(csv_name, mode='a', header=None, encoding='utf-8')
line_counter += len(df.index)
results = []

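Note on the hunk above: it is truncated where the exclude filter trims in_out_pairs before run_model is called. A minimal, hypothetical sketch of that filtering step (not the verbatim implementation; it assumes exclude entries follow the "<repo_id>:<input_length>" convention visible in the surrounding lines):

# Hypothetical sketch of the exclude filtering around the truncated hunk.
def filter_in_out_pairs(model, in_out_pairs, excludes):
    """Drop input/output pairs that are explicitly excluded for one model."""
    kept = []
    for in_out in in_out_pairs:
        # "1024-128" -> input length "1024"
        model_id_input = model + ':' + in_out.split('-')[0]
        if model_id_input not in excludes:
            kept.append(in_out)
    return kept

# Example: skip only the 2048-token input case for a single model.
pairs = filter_in_out_pairs(
    "meta-llama/Llama-2-7b-chat-hf",
    ["32-32", "1024-128", "2048-256"],
    excludes=["meta-llama/Llama-2-7b-chat-hf:2048"],
)
print(pairs)  # ['32-32', '1024-128']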
2 changes: 1 addition & 1 deletion python/llm/src/ipex_llm/transformers/models/llama.py
@@ -1579,7 +1579,7 @@ def llama_attention_forward_4_41_original(
past_key_value.key_cache[self.layer_idx] = key_states
past_key_value.value_cache[self.layer_idx] = value_states

if cache_position is not None:
if attention_mask is not None:
new_attention_mask = attention_mask[:, :, :, 0:kv_seq_len]
else:
new_attention_mask = attention_mask
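The change above guards the 4-D mask slice on attention_mask itself rather than on cache_position, presumably because the mask can be None even when a cache position is provided, and slicing None would raise a TypeError. A minimal sketch of the guarded slice, with the mask shape assumed purely for illustration:

import torch

def slice_attention_mask(attention_mask, kv_seq_len):
    # Assumed illustrative shape: (batch, 1, q_len, padded_kv_len).
    if attention_mask is not None:
        # Keep only the columns that correspond to real key/value positions.
        return attention_mask[:, :, :, 0:kv_seq_len]
    # No mask supplied (e.g. plain causal generation): pass None through.
    return attention_mask

mask = torch.zeros(1, 1, 1, 32)               # mask padded to 32 positions
print(slice_attention_mask(mask, 20).shape)   # torch.Size([1, 1, 1, 20])
print(slice_attention_mask(None, 20))         # None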
2 changes: 1 addition & 1 deletion python/llm/src/ipex_llm/utils/benchmark_util_4_29.py
@@ -2452,7 +2452,7 @@ def greedy_search(
last_token_time.append(end - st)

# stop if we exceed the maximum length
if stopping_criteria(input_ids, scores):
if stopping_criteria(input_ids, scores)[0]:
this_peer_finished = True

if this_peer_finished and not synced_gpus:
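The added [0] reflects that recent transformers releases return a per-sequence boolean tensor from the stopping criteria instead of a single Python bool, so the flag for the first (and, in this benchmark path, only) sequence has to be indexed explicitly. A small sketch using a hypothetical stand-in for the criteria callable:

import torch

def stopping_criteria(input_ids, scores, max_length=8):
    # Hypothetical stand-in: newer transformers return one flag per sequence.
    done = input_ids.shape[-1] >= max_length
    return torch.full((input_ids.shape[0],), done, dtype=torch.bool)

input_ids = torch.ones(2, 8, dtype=torch.long)

# `if stopping_criteria(...)` would raise for a batch of more than one
# sequence ("Boolean value of Tensor ... is ambiguous"), so the updated
# line checks the first sequence explicitly.
if stopping_criteria(input_ids, scores=None)[0]:
    print("generation finished")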