Test For Transformers 4.41 Upgrade #11684

Draft: wants to merge 8 commits into base: main
51 changes: 28 additions & 23 deletions .github/workflows/llm_performance_tests.yml
@@ -154,7 +154,7 @@ jobs:
source /opt/intel/oneapi/setvars.sh
export USE_XETLA=OFF
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
pip install transformers==4.36.2
pip install transformers==4.41.2 trl
cp python/llm/test/benchmark/arc-perf-transformers-436.yaml python/llm/dev/benchmark/all-in-one/config.yaml
cd python/llm/dev/benchmark/all-in-one
mkdir test_batch1
@@ -191,7 +191,7 @@ jobs:
export USE_XETLA=OFF
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
# upgrade for default transformers version
python -m pip install transformers==4.37.0
python -m pip install transformers==4.41.2 trl
# batch_size 1
cp python/llm/test/benchmark/arc-perf-transformers-437.yaml python/llm/dev/benchmark/all-in-one/config.yaml
cd python/llm/dev/benchmark/all-in-one
@@ -223,7 +223,7 @@ jobs:
export USE_XETLA=OFF
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
# upgrade transformers for model Qwen/Qwen1.5-MoE-A2.7B-Chat
python -m pip install transformers==4.40.0
python -m pip install transformers==4.41.2
python -m pip install trl
# batch_size 1
cp python/llm/test/benchmark/arc-perf-transformers-440.yaml python/llm/dev/benchmark/all-in-one/config.yaml
@@ -419,6 +419,8 @@ jobs:
export https_proxy=${HTTPS_PROXY}
source ipex-llm-init -t
export OMP_NUM_THREADS=48
# upgrade for default transformers version
python -m pip install transformers==4.41.2 trl
# hide time info
sed -i 's/str(end - st)/"xxxxxx"/g' run.py
python run.py
@@ -499,6 +501,8 @@ jobs:
cd python/llm/dev/benchmark/all-in-one
export http_proxy=${HTTP_PROXY}
export https_proxy=${HTTPS_PROXY}
# upgrade for default transformers version
python -m pip install transformers==4.41.2 trl
# hide time info
sed -i 's/str(end - st)/"xxxxxx"/g' run.py
python run.py
@@ -557,7 +561,7 @@ jobs:
pip install --upgrade pip
pip install --upgrade wheel
pip install --upgrade omegaconf pandas
pip install --upgrade tiktoken einops transformers_stream_generator matplotlib
pip install --upgrade tiktoken einops transformers_stream_generator matplotlib trl

cd python\llm
python setup.py clean --all bdist_wheel --win
@@ -653,6 +657,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.41.2 trl
set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
REM for llava
@@ -678,7 +683,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.36.2
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -703,7 +708,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.38.2
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -775,7 +780,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.37.0
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -802,7 +807,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.36.2
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -827,7 +832,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.38.2
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -898,7 +903,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.37.0
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -925,7 +930,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.36.2
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -950,7 +955,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.38.2
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -1021,7 +1026,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.37.0
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -1048,7 +1053,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.36.2
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -1073,7 +1078,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.38.2
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -1144,7 +1149,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.37.0
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -1171,7 +1176,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.38.2
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -1242,7 +1247,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.37.0
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -1269,7 +1274,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.36.2
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -1294,7 +1299,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.38.2
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -1364,7 +1369,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.37.0
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -1391,7 +1396,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.36.2
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
@@ -1416,7 +1421,7 @@ jobs:
shell: cmd
run: |
call conda activate igpu-perf
pip install transformers==4.38.2
pip install transformers==4.41.2 trl

set SYCL_CACHE_PERSISTENT=1
set BIGDL_LLM_XMX_DISABLED=1
11 changes: 11 additions & 0 deletions python/llm/dev/benchmark/all-in-one/run.py
@@ -2038,6 +2038,7 @@ def run_pipeline_parallel_gpu(repo_id,
for api in conf.test_api:
global csv_name
csv_name = f'{current_dir}/{api}-results-{today}.csv'

try:
line_counter = len(open(csv_name).readlines())
except:
@@ -2049,6 +2050,9 @@ def run_pipeline_parallel_gpu(repo_id,
for batch_size in batch_list:
for model in conf.repo_id:
in_out_pairs = conf['in_out_pairs'].copy()
print("-------------------- Start running batch_size: {} --------------------".format(batch_size))
print("-------------------- Start running model: {} --------------------".format(model))
print("--------------------in_out_pairs: {}--------------------".format(in_out_pairs))
if excludes:
for in_out in conf['in_out_pairs']:
model_id_input = model + ':' + in_out.split('-')[0]
@@ -2059,9 +2063,15 @@ def run_pipeline_parallel_gpu(repo_id,
lookahead = True
run_model(model, api, in_out_pairs, conf['local_model_hub'], conf['warm_up'], conf['num_trials'], conf['num_beams'],
conf['low_bit'], conf['cpu_embedding'], batch_size, streaming, use_fp16_torch_dtype, lookahead, task, optimize_model)
print("-------------------- Finish running model: {} --------------------".format(model))
df = pd.DataFrame(results, columns=['model', '1st token avg latency (ms)', '2+ avg latency (ms/token)', 'encoder time (ms)',
'input/output tokens', 'batch_size', 'actual input/output tokens', 'num_beams', 'low_bit', 'cpu_embedding',
'model loading time (s)', 'peak mem (GB)', 'streaming', 'use_fp16_torch_dtype'])
print("-------------------- Results df:--------------------")
print(df)
print("-------------------- Results: {} --------------------".format(results))
print("-------------------- csv_name: {} --------------------".format(csv_name))
print(conf)
if "pipeline" in api or "deepspeed" in api:
if torch.distributed.get_rank() == 0:
df.index += max(line_counter - 1, 0)
@@ -2079,3 +2089,4 @@ def run_pipeline_parallel_gpu(repo_id,
df.to_csv(csv_name, mode='a', header=None, encoding='utf-8')
line_counter += len(df.index)
results = []

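Note on the hunk above: it is truncated where the exclude filter trims in_out_pairs before run_model is called. A minimal, hypothetical sketch of that filtering step (not the verbatim implementation; it assumes exclude entries follow the "<repo_id>:<input_length>" convention visible in the surrounding lines):

# Hypothetical sketch of the exclude filtering around the truncated hunk.
def filter_in_out_pairs(model, in_out_pairs, excludes):
    """Drop input/output pairs that are explicitly excluded for one model."""
    kept = []
    for in_out in in_out_pairs:
        # "1024-128" -> input length "1024"
        model_id_input = model + ':' + in_out.split('-')[0]
        if model_id_input not in excludes:
            kept.append(in_out)
    return kept

# Example: skip only the 2048-token input case for a single model.
pairs = filter_in_out_pairs(
    "meta-llama/Llama-2-7b-chat-hf",
    ["32-32", "1024-128", "2048-256"],
    excludes=["meta-llama/Llama-2-7b-chat-hf:2048"],
)
print(pairs)  # ['32-32', '1024-128']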
2 changes: 1 addition & 1 deletion python/llm/src/ipex_llm/transformers/models/llama.py
@@ -1579,7 +1579,7 @@ def llama_attention_forward_4_41_original(
past_key_value.key_cache[self.layer_idx] = key_states
past_key_value.value_cache[self.layer_idx] = value_states

if cache_position is not None:
if attention_mask is not None:
new_attention_mask = attention_mask[:, :, :, 0:kv_seq_len]
else:
new_attention_mask = attention_mask
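The change above guards the 4-D mask slice on attention_mask itself rather than on cache_position, presumably because the mask can be None even when a cache position is provided, and slicing None would raise a TypeError. A minimal sketch of the guarded slice, with the mask shape assumed purely for illustration:

import torch

def slice_attention_mask(attention_mask, kv_seq_len):
    # Assumed illustrative shape: (batch, 1, q_len, padded_kv_len).
    if attention_mask is not None:
        # Keep only the columns that correspond to real key/value positions.
        return attention_mask[:, :, :, 0:kv_seq_len]
    # No mask supplied (e.g. plain causal generation): pass None through.
    return attention_mask

mask = torch.zeros(1, 1, 1, 32)               # mask padded to 32 positions
print(slice_attention_mask(mask, 20).shape)   # torch.Size([1, 1, 1, 20])
print(slice_attention_mask(None, 20))         # None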
2 changes: 1 addition & 1 deletion python/llm/src/ipex_llm/utils/benchmark_util_4_29.py
@@ -2452,7 +2452,7 @@ def greedy_search(
last_token_time.append(end - st)

# stop if we exceed the maximum length
if stopping_criteria(input_ids, scores):
if stopping_criteria(input_ids, scores)[0]:
this_peer_finished = True

if this_peer_finished and not synced_gpus:
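The added [0] reflects that recent transformers releases return a per-sequence boolean tensor from the stopping criteria instead of a single Python bool, so the flag for the first (and, in this benchmark path, only) sequence has to be indexed explicitly. A small sketch using a hypothetical stand-in for the criteria callable:

import torch

def stopping_criteria(input_ids, scores, max_length=8):
    # Hypothetical stand-in: newer transformers return one flag per sequence.
    done = input_ids.shape[-1] >= max_length
    return torch.full((input_ids.shape[0],), done, dtype=torch.bool)

input_ids = torch.ones(2, 8, dtype=torch.long)

# `if stopping_criteria(...)` would raise for a batch of more than one
# sequence ("Boolean value of Tensor ... is ambiguous"), so the updated
# line checks the first sequence explicitly.
if stopping_criteria(input_ids, scores=None)[0]:
    print("generation finished")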