@@ -53,8 +53,8 @@ def run_model_in_thread(model, in_out, tokenizer, result, warm_up, num_beams, in
     for i in range(num_trials + warm_up):
         st = time.perf_counter()
         if lookahead:
-            output_ids = model.generate(input_ids, lookahead=3, do_sample=False, max_matching_ngram_size=2, max_new_tokens=out_len,
-                                        min_new_tokens=out_len, num_beams=num_beams)
+            output_ids = model.generate(input_ids, lookahead=2, do_sample=False, max_matching_ngram_size=2, max_new_tokens=out_len,
+                                        min_new_tokens=out_len, num_beams=num_beams)
         else:
             output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len,
                                         min_new_tokens=out_len, num_beams=num_beams)
@@ -67,8 +67,8 @@ def run_model_in_thread(model, in_out, tokenizer, result, warm_up, num_beams, in
         torch.xpu.empty_cache()
         actual_out_len = output_ids.shape[1] - actual_in_len
         if i >= warm_up:
-            if lookahead:
-                result[in_out].append([model.first_token_time, (end - st - model.first_token_time)/model.n_token_generated, 0,
+            if lookahead or os.environ.get("IPEX_LLM_PERFORMANCE_MODE", None) == "1":
+                result[in_out].append([model.first_token_time, (end - st - model.first_token_time)/(model.n_token_generated - 1), 0,
                                        actual_in_len, actual_out_len, load_time, 0])
             else:
                 result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
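Note on the hunk above, as a sketch under the assumption that model.n_token_generated counts the first token as well: the elapsed time measured after the first token covers only the remaining tokens, so dividing by n_token_generated - 1 rather than n_token_generated gives the correct per-token average for the rest of the sequence. A minimal standalone version of the calculation:

    def rest_token_latency(total_time, first_token_time, n_token_generated):
        # Average latency of every token after the first; assumes
        # n_token_generated includes the first token, hence the - 1.
        return (total_time - first_token_time) / (n_token_generated - 1)

    # Example: 2.0 s total, 0.5 s to the first token, 31 tokens generated
    # -> 1.5 s spread over 30 rest tokens = 0.05 s per token.
    print(rest_token_latency(2.0, 0.5, 31))  # 0.05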
@@ -510,7 +510,7 @@ def run_transformer_int4_gpu(repo_id,
     load_time = end - st
     print(">> loading of model costs {}s and {}GB".format(load_time, torch.xpu.memory.memory_reserved()/(1024**3)))
 
-    if not lookahead:
+    if not lookahead and os.environ.get("IPEX_LLM_PERFORMANCE_MODE", None) != "1":
         model = BenchmarkWrapper(model)
 
     result = {}
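For context, a hypothetical driver snippet (not part of the diff) showing how the new environment check would be exercised: with IPEX_LLM_PERFORMANCE_MODE set to "1", the model is no longer wrapped in BenchmarkWrapper, and the result row is built from model.first_token_time and model.n_token_generated, the same attributes the lookahead path already relies on.

    import os

    # Enable performance mode before the benchmark loads the model, so the
    # "if not lookahead and ..." branch above skips BenchmarkWrapper.
    os.environ["IPEX_LLM_PERFORMANCE_MODE"] = "1"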