
Commit 490bb0c

[NPU] update fused layers for GW (#12459)
* update fused layers for GW
* fix
* fix llama condition for glm model
* update
1 parent 1b533a1 commit 490bb0c
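
This commit adds a `fuse_layers` keyword so callers can override the number of fused decoder layers when converting a model for NPU pipeline deployment. A minimal usage sketch, assuming the `AutoModelForCausalLM` entry point from `ipex_llm.transformers.npu_model`; the model id, low-bit setting, and other argument values below are illustrative rather than taken from this commit:

```python
# Illustrative sketch only: the model id and most argument values are placeholders.
# `fuse_layers` is the option added by this commit; when set, it overrides the
# built-in per-model fused-layer defaults.
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",   # placeholder model path
    load_in_low_bit="sym_int4",        # assumed low-bit setting
    optimize_model=True,
    convert_model=True,                # convert for pipeline deployment on NPU
    save_directory="./llama2-npu",     # used together with convert_model
    fuse_layers=4,                     # new: explicit fused decoder-layer count
    trust_remote_code=True,
)
```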

File tree: 2 files changed (+42, -18 lines)


python/llm/src/ipex_llm/transformers/npu_model.py

Lines changed: 5 additions & 1 deletion
```diff
@@ -136,6 +136,7 @@ def from_pretrained(cls, *args, **kwargs):
         mock_device = kwargs.pop('device', None)  # For mock on CPU
         convert_model = kwargs.pop('convert_model', False)
         save_directory = kwargs.pop('save_directory', None)
+        fuse_layers = kwargs.pop('fuse_layers', None)
 
         invalidInputError(
             quantization_group_size in [0, 32, 64, 128],
@@ -204,6 +205,7 @@ def from_pretrained(cls, *args, **kwargs):
                 "transpose_value_cache": transpose_value_cache,
                 "convert_model": convert_model,
                 "save_directory": save_directory,
+                "fuse_layers": fuse_layers
             }
             model = cls.optimize_npu_model(*args, **optimize_kwargs)
         else:
@@ -243,6 +245,7 @@ def optimize_npu_model(cls, *args, **kwargs):
         transpose_value_cache = kwargs.pop("transpose_value_cache", True)
         convert_model = kwargs.pop('convert_model', False)
         save_directory = kwargs.pop('save_directory', None)
+        fuse_layers = kwargs.pop('fuse_layers', None)
 
         if hasattr(model, "llm"):
             llm = model.llm
@@ -282,7 +285,8 @@ def optimize_npu_model(cls, *args, **kwargs):
                         group_size=quantization_group_size,
                         qtype=qtype,
                         convert_model=convert_model,
-                        save_directory=save_directory)
+                        save_directory=save_directory,
+                        fuse_layers=fuse_layers)
         model.save_low_bit = types.MethodType(save_low_bit, model)
         return model
```
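
The new keyword follows the same pop-and-forward pattern as the existing `convert_model` and `save_directory` options: popped from `**kwargs` with a `None` default in `from_pretrained`, passed through `optimize_npu_model`, and finally handed to `convert_llm`. A self-contained sketch of that pattern; names ending in `_sketch` are illustrative stand-ins, not the real API, and the real methods carry many more arguments:

```python
# Simplified, runnable sketch of the pop-and-forward pattern used for `fuse_layers`.
def optimize_npu_model_sketch(**kwargs):
    fuse_layers = kwargs.pop("fuse_layers", None)
    # ...in the real code this is forwarded to convert_llm(..., fuse_layers=fuse_layers)
    return fuse_layers

def from_pretrained_sketch(**kwargs):
    fuse_layers = kwargs.pop("fuse_layers", None)        # None keeps built-in defaults
    optimize_kwargs = {
        "convert_model": kwargs.pop("convert_model", False),
        "save_directory": kwargs.pop("save_directory", None),
        "fuse_layers": fuse_layers,                      # new in this commit
    }
    return optimize_npu_model_sketch(**optimize_kwargs)

assert from_pretrained_sketch(fuse_layers=2) == 2        # explicit override survives the chain
assert from_pretrained_sketch() is None                  # default: model-specific behavior
```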

python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py

Lines changed: 37 additions & 17 deletions
```diff
@@ -195,7 +195,8 @@ def convert_llm(model: torch.nn.Module,
                 group_size: int,
                 qtype: str,
                 convert_model: bool=False,
-                save_directory: str=None):
+                save_directory: str=None,
+                fuse_layers: int=None):
     # whether to set layernorm weight as const
     layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"
     if group_size == 0:
@@ -216,7 +217,8 @@ def convert_llm(model: torch.nn.Module,
                                n_splits_linear,
                                n_splits_down_proj,
                                group_size,
-                               save_directory)
+                               save_directory,
+                               fuse_layers=fuse_layers)
         return 0
     if model.config.model_type == "llama":
         with tempfile.TemporaryDirectory() as temp_dir:
@@ -422,18 +424,22 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                            n_splits_linear: int,
                            n_splits_down_proj: int,
                            group_size: int,
-                           save_directory: str=None):
+                           save_directory: str=None,
+                           fuse_layers: int=None):
     os.mkdir(save_directory)
     weight_dir = os.path.join(save_directory, "model_weights")
     os.mkdir(weight_dir)
     layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"
 
     if model.config.model_type == "qwen2":
-        if model.config.hidden_size == 1536:
-            # Qwen2-1.5B-Instruct
-            fused_layers = 1
+        if group_size == 0:
+            if model.config.hidden_size == 1536:
+                # Qwen2-1.5B-Instruct
+                fused_layers = 1 if fuse_layers is None else fuse_layers
+            else:
+                fused_layers = 2 if fuse_layers is None else fuse_layers
         else:
-            fused_layers = 2
+            fused_layers = len(model.model.layers) if fuse_layers is None else fuse_layers
         update_dict = {"kv_len": kv_len,
                        "num_head": model.model.layers[0].self_attn.num_heads,
                        "head_dim": model.model.layers[0].self_attn.head_dim,
@@ -469,20 +475,31 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         embedding_post = False
         cos_sin_input = False
         use_prefill_sdp = False
-        if model.config.vocab_size == 32000:
-            # for Llama2-7B
-            fused_layers = 4
-            use_prefill_sdp = True
-        else:
-            if model.config.intermediate_size == 8192:
+        if group_size == 0:
+            if model.config.intermediate_size == 11008:
+                # for Llama2-7B
+                fused_layers = 4 if fuse_layers is None else fuse_layers
+                use_prefill_sdp = True
+            elif model.config.intermediate_size == 14336:
+                # for Llama3-8B
+                fused_layers = 2 if fuse_layers is None else fuse_layers
+                use_prefill_sdp = True
+            elif not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"):
                 # llama3.2 1B & # llama3.2 3B
                 embedding_post = True
                 cos_sin_input = True
-                fused_layers = 2
+                fused_layers = 2 if fuse_layers is None else fuse_layers
             else:
-                # for Llama3-8B
-                fused_layers = 2
+                fused_layers = 2 if fuse_layers is None else fuse_layers
+        else:
+            if model.config.intermediate_size in [11008, 14336]:
+                # for Llama2-7B & Llama3-8B
                 use_prefill_sdp = True
+            elif not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"):
+                # llama3.2 1B & # llama3.2 3B
+                embedding_post = True
+                cos_sin_input = True
+            fused_layers = len(model.model.layers) if fuse_layers is None else fuse_layers
         update_dict = {"kv_len": kv_len,
                        "num_head": model.model.layers[0].self_attn.num_heads,
                        "head_dim": model.model.layers[0].self_attn.head_dim,
@@ -518,7 +535,10 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                                   save_directory, weight_dir, transpose_value_cache, max_prompt_len,
                                   group_size, layernorm_const, "prefill")
     elif model.config.model_type == "minicpm":
-        fused_layers = 4
+        if group_size == 0:
+            fused_layers = 4 if fuse_layers is None else fuse_layers
+        else:
+            fused_layers = len(model.model.layers) if fuse_layers is None else fuse_layers
         update_dict = {"kv_len": kv_len,
                        "num_head": model.model.layers[0].self_attn.num_heads,
                        "head_dim": model.model.layers[0].self_attn.head_dim,
```
