[NPU] update fused layers for GW (#12459)
* update fused layers for GW

* fix

* fix llama condition for glm model

* update
rnwang04 authored Nov 28, 2024
1 parent 1b533a1 commit 490bb0c
Showing 2 changed files with 42 additions and 18 deletions.
6 changes: 5 additions & 1 deletion python/llm/src/ipex_llm/transformers/npu_model.py
@@ -136,6 +136,7 @@ def from_pretrained(cls, *args, **kwargs):
mock_device = kwargs.pop('device', None) # For mock on CPU
convert_model = kwargs.pop('convert_model', False)
save_directory = kwargs.pop('save_directory', None)
fuse_layers = kwargs.pop('fuse_layers', None)

invalidInputError(
quantization_group_size in [0, 32, 64, 128],
@@ -204,6 +205,7 @@ def from_pretrained(cls, *args, **kwargs):
"transpose_value_cache": transpose_value_cache,
"convert_model": convert_model,
"save_directory": save_directory,
"fuse_layers": fuse_layers
}
model = cls.optimize_npu_model(*args, **optimize_kwargs)
else:
@@ -243,6 +245,7 @@ def optimize_npu_model(cls, *args, **kwargs):
transpose_value_cache = kwargs.pop("transpose_value_cache", True)
convert_model = kwargs.pop('convert_model', False)
save_directory = kwargs.pop('save_directory', None)
fuse_layers = kwargs.pop('fuse_layers', None)

if hasattr(model, "llm"):
llm = model.llm
@@ -282,7 +285,8 @@ def optimize_npu_model(cls, *args, **kwargs):
group_size=quantization_group_size,
qtype=qtype,
convert_model=convert_model,
save_directory=save_directory)
save_directory=save_directory,
fuse_layers=fuse_layers)
model.save_low_bit = types.MethodType(save_low_bit, model)
return model

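For context, a minimal sketch of how the new keyword might be passed from user code, assuming the usual AutoModelForCausalLM entry point in npu_model.py; the model id, output path, and values are illustrative, and only kwargs visible in this diff are shown:

from ipex_llm.transformers.npu_model import AutoModelForCausalLM

# Illustrative values only. fuse_layers=None (the default) keeps the
# per-model defaults picked in convert_llm_for_deploy below; an integer
# overrides them.
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-1.5B-Instruct",        # hypothetical model id
    convert_model=True,
    save_directory="./npu-converted",  # hypothetical output dir
    fuse_layers=2,                     # new kwarg added in this commit
)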
Second changed file: 37 additions & 17 deletions
@@ -195,7 +195,8 @@ def convert_llm(model: torch.nn.Module,
group_size: int,
qtype: str,
convert_model: bool=False,
save_directory: str=None):
save_directory: str=None,
fuse_layers: int=None):
# whether to set layernorm weight as const
layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"
if group_size == 0:
@@ -216,7 +217,8 @@ def convert_llm(model: torch.nn.Module,
n_splits_linear,
n_splits_down_proj,
group_size,
save_directory)
save_directory,
fuse_layers=fuse_layers)
return 0
if model.config.model_type == "llama":
with tempfile.TemporaryDirectory() as temp_dir:
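The remaining hunks apply the same override idiom, "default if fuse_layers is None else fuse_layers", once per model family. In isolation the idiom is simply the following (helper name is hypothetical, added only to make the semantics explicit):

def resolve_fused_layers(default, fuse_layers=None):
    # An explicit fuse_layers overrides the per-model default;
    # None keeps the default.
    return default if fuse_layers is None else fuse_layers

assert resolve_fused_layers(4) == 4                   # keep the default
assert resolve_fused_layers(4, fuse_layers=2) == 2    # caller override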
@@ -422,18 +424,22 @@ def convert_llm_for_deploy(model: torch.nn.Module,
n_splits_linear: int,
n_splits_down_proj: int,
group_size: int,
save_directory: str=None):
save_directory: str=None,
fuse_layers: int=None):
os.mkdir(save_directory)
weight_dir = os.path.join(save_directory, "model_weights")
os.mkdir(weight_dir)
layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"

if model.config.model_type == "qwen2":
if model.config.hidden_size == 1536:
# Qwen2-1.5B-Instruct
fused_layers = 1
if group_size == 0:
if model.config.hidden_size == 1536:
# Qwen2-1.5B-Instruct
fused_layers = 1 if fuse_layers is None else fuse_layers
else:
fused_layers = 2 if fuse_layers is None else fuse_layers
else:
fused_layers = 2
fused_layers = len(model.model.layers) if fuse_layers is None else fuse_layers
update_dict = {"kv_len": kv_len,
"num_head": model.model.layers[0].self_attn.num_heads,
"head_dim": model.model.layers[0].self_attn.head_dim,
@@ -469,20 +475,31 @@ def convert_llm_for_deploy(model: torch.nn.Module,
embedding_post = False
cos_sin_input = False
use_prefill_sdp = False
if model.config.vocab_size == 32000:
# for Llama2-7B
fused_layers = 4
use_prefill_sdp = True
else:
if model.config.intermediate_size == 8192:
if group_size == 0:
if model.config.intermediate_size == 11008:
# for Llama2-7B
fused_layers = 4 if fuse_layers is None else fuse_layers
use_prefill_sdp = True
elif model.config.intermediate_size == 14336:
# for Llama3-8B
fused_layers = 2 if fuse_layers is None else fuse_layers
use_prefill_sdp = True
elif not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"):
# llama3.2 1B & # llama3.2 3B
embedding_post = True
cos_sin_input = True
fused_layers = 2
fused_layers = 2 if fuse_layers is None else fuse_layers
else:
# for Llama3-8B
fused_layers = 2
fused_layers = 2 if fuse_layers is None else fuse_layers
else:
if model.config.intermediate_size in [11008, 14336]:
# for Llama2-7B & Llama3-8B
use_prefill_sdp = True
elif not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"):
# llama3.2 1B & # llama3.2 3B
embedding_post = True
cos_sin_input = True
fused_layers = len(model.model.layers) if fuse_layers is None else fuse_layers
update_dict = {"kv_len": kv_len,
"num_head": model.model.layers[0].self_attn.num_heads,
"head_dim": model.model.layers[0].self_attn.head_dim,
@@ -518,7 +535,10 @@ def convert_llm_for_deploy(model: torch.nn.Module,
save_directory, weight_dir, transpose_value_cache, max_prompt_len,
group_size, layernorm_const, "prefill")
elif model.config.model_type == "minicpm":
fused_layers = 4
if group_size == 0:
fused_layers = 4 if fuse_layers is None else fuse_layers
else:
fused_layers = len(model.model.layers) if fuse_layers is None else fuse_layers
update_dict = {"kv_len": kv_len,
"num_head": model.model.layers[0].self_attn.num_heads,
"head_dim": model.model.layers[0].self_attn.head_dim,
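Taken together, the per-model fused-layer defaults after this commit resolve to roughly the helper below. This is a hypothetical restatement for readability, reconstructed from the added lines above; the name and flat signature are not part of the commit, and the real code inlines these branches in convert_llm_for_deploy:

def default_fused_layers(model_type, hidden_size, intermediate_size,
                         num_layers, group_size, fuse_layers=None):
    # An explicit fuse_layers always wins.
    if fuse_layers is not None:
        return fuse_layers
    if group_size != 0:
        # Non-zero quantization group size: default to one entry per decoder layer.
        return num_layers
    if model_type == "qwen2":
        # Qwen2-1.5B-Instruct (hidden_size 1536) vs. larger Qwen2 models.
        return 1 if hidden_size == 1536 else 2
    if model_type == "llama":
        if intermediate_size == 11008:   # Llama2-7B
            return 4
        # Llama3-8B, Llama3.2-1B/3B, and the fallback branch all use 2.
        return 2
    if model_type == "minicpm":
        return 4
    raise ValueError(f"unhandled model_type: {model_type}")

Net effect, as far as the diff shows: the old fixed defaults now apply only when group_size == 0, non-zero group sizes default to one fused entry per decoder layer, and callers can override either case through the new fuse_layers keyword.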
