@@ -195,7 +195,8 @@ def convert_llm(model: torch.nn.Module,
                 group_size: int,
                 qtype: str,
                 convert_model: bool = False,
-                save_directory: str = None):
+                save_directory: str = None,
+                fuse_layers: int = None):
     # whether to set layernorm weight as const
     layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"
     if group_size == 0:
@@ -216,7 +217,8 @@ def convert_llm(model: torch.nn.Module,
                                n_splits_linear,
                                n_splits_down_proj,
                                group_size,
-                               save_directory)
+                               save_directory,
+                               fuse_layers=fuse_layers)
         return 0
     if model.config.model_type == "llama":
         with tempfile.TemporaryDirectory() as temp_dir:
@@ -422,18 +424,22 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                            n_splits_linear: int,
                            n_splits_down_proj: int,
                            group_size: int,
-                           save_directory: str = None):
+                           save_directory: str = None,
+                           fuse_layers: int = None):
     os.mkdir(save_directory)
     weight_dir = os.path.join(save_directory, "model_weights")
     os.mkdir(weight_dir)
     layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"

     if model.config.model_type == "qwen2":
-        if model.config.hidden_size == 1536:
-            # Qwen2-1.5B-Instruct
-            fused_layers = 1
+        if group_size == 0:
+            if model.config.hidden_size == 1536:
+                # Qwen2-1.5B-Instruct
+                fused_layers = 1 if fuse_layers is None else fuse_layers
+            else:
+                fused_layers = 2 if fuse_layers is None else fuse_layers
         else:
-            fused_layers = 2
+            fused_layers = len(model.model.layers) if fuse_layers is None else fuse_layers
         update_dict = {"kv_len": kv_len,
                        "num_head": model.model.layers[0].self_attn.num_heads,
                        "head_dim": model.model.layers[0].self_attn.head_dim,
@@ -469,20 +475,31 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         embedding_post = False
         cos_sin_input = False
         use_prefill_sdp = False
-        if model.config.vocab_size == 32000:
-            # for Llama2-7B
-            fused_layers = 4
-            use_prefill_sdp = True
-        else:
-            if model.config.intermediate_size == 8192:
+        if group_size == 0:
+            if model.config.intermediate_size == 11008:
+                # for Llama2-7B
+                fused_layers = 4 if fuse_layers is None else fuse_layers
+                use_prefill_sdp = True
+            elif model.config.intermediate_size == 14336:
+                # for Llama3-8B
+                fused_layers = 2 if fuse_layers is None else fuse_layers
+                use_prefill_sdp = True
+            elif not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"):
                 # llama3.2 1B & # llama3.2 3B
                 embedding_post = True
                 cos_sin_input = True
-                fused_layers = 2
+                fused_layers = 2 if fuse_layers is None else fuse_layers
             else:
-                # for Llama3-8B
-                fused_layers = 2
+                fused_layers = 2 if fuse_layers is None else fuse_layers
+        else:
+            if model.config.intermediate_size in [11008, 14336]:
+                # for Llama2-7B & Llama3-8B
                 use_prefill_sdp = True
+            elif not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"):
+                # llama3.2 1B & # llama3.2 3B
+                embedding_post = True
+                cos_sin_input = True
+            fused_layers = len(model.model.layers) if fuse_layers is None else fuse_layers
         update_dict = {"kv_len": kv_len,
                        "num_head": model.model.layers[0].self_attn.num_heads,
                        "head_dim": model.model.layers[0].self_attn.head_dim,
@@ -518,7 +535,10 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                             save_directory, weight_dir, transpose_value_cache, max_prompt_len,
                             group_size, layernorm_const, "prefill")
     elif model.config.model_type == "minicpm":
-        fused_layers = 4
+        if group_size == 0:
+            fused_layers = 4 if fuse_layers is None else fuse_layers
+        else:
+            fused_layers = len(model.model.layers) if fuse_layers is None else fuse_layers
         update_dict = {"kv_len": kv_len,
                        "num_head": model.model.layers[0].self_attn.num_heads,
                        "head_dim": model.model.layers[0].self_attn.head_dim,
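For reference, here is a self-contained restatement of the fused-layer defaults this diff introduces. It is an illustrative paraphrase only, not the shipped code: resolve_fused_layers and its flat arguments (num_layers, hidden_size, intermediate_size) are stand-ins for the values the patch reads from model.config and model.model.layers.

# Illustrative sketch of how fused_layers is chosen after this change.
# An explicit fuse_layers always wins; otherwise the default depends on
# group_size and the model configuration.
def resolve_fused_layers(model_type, num_layers, group_size, fuse_layers=None,
                         hidden_size=None, intermediate_size=None):
    if fuse_layers is not None:
        # Caller-supplied override takes precedence for every model type.
        return fuse_layers
    if group_size != 0:
        # Grouped quantization: default to one fused group per decoder layer.
        return num_layers
    # group_size == 0 defaults, per model family:
    if model_type == "qwen2":
        return 1 if hidden_size == 1536 else 2          # Qwen2-1.5B-Instruct vs. larger Qwen2
    if model_type == "llama":
        return 4 if intermediate_size == 11008 else 2   # Llama2-7B vs. Llama3 / Llama3.2
    if model_type == "minicpm":
        return 4
    raise ValueError(f"unsupported model_type: {model_type}")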