From 06bd01e00eea9694cf4d2f42d60fb2747a10b1a0 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Thu, 28 Nov 2024 10:51:52 +0800 Subject: [PATCH 1/4] update fused layers for GW --- .../src/ipex_llm/transformers/npu_model.py | 6 +- .../npu_pipeline_model/convert_pipeline.py | 63 +++++++++++++------ 2 files changed, 48 insertions(+), 21 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index eb684bce715..fc3c0879dd1 100644 --- a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -136,6 +136,7 @@ def from_pretrained(cls, *args, **kwargs): mock_device = kwargs.pop('device', None) # For mock on CPU convert_model = kwargs.pop('convert_model', False) save_directory = kwargs.pop('save_directory', None) + fuse_layers = kwargs.pop('fuse_layers', None) invalidInputError( quantization_group_size in [0, 32, 64, 128], @@ -204,6 +205,7 @@ def from_pretrained(cls, *args, **kwargs): "transpose_value_cache": transpose_value_cache, "convert_model": convert_model, "save_directory": save_directory, + "fuse_layers": fuse_layers } model = cls.optimize_npu_model(*args, **optimize_kwargs) else: @@ -243,6 +245,7 @@ def optimize_npu_model(cls, *args, **kwargs): transpose_value_cache = kwargs.pop("transpose_value_cache", True) convert_model = kwargs.pop('convert_model', False) save_directory = kwargs.pop('save_directory', None) + fuse_layers = kwargs.pop('fuse_layers', None) if hasattr(model, "llm"): llm = model.llm @@ -282,7 +285,8 @@ def optimize_npu_model(cls, *args, **kwargs): group_size=quantization_group_size, qtype=qtype, convert_model=convert_model, - save_directory=save_directory) + save_directory=save_directory, + fuse_layers=fuse_layers) model.save_low_bit = types.MethodType(save_low_bit, model) return model diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py index e060e3a30ed..7ccd1ae9615 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py @@ -195,7 +195,8 @@ def convert_llm(model: torch.nn.Module, group_size: int, qtype: str, convert_model: bool=False, - save_directory: str=None): + save_directory: str=None, + fuse_layers: int=None): # whether to set layernorm weight as const layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1" if group_size == 0: @@ -216,7 +217,8 @@ def convert_llm(model: torch.nn.Module, n_splits_linear, n_splits_down_proj, group_size, - save_directory) + save_directory, + fuse_layers=fuse_layers) return 0 if model.config.model_type == "llama": with tempfile.TemporaryDirectory() as temp_dir: @@ -422,18 +424,22 @@ def convert_llm_for_deploy(model: torch.nn.Module, n_splits_linear: int, n_splits_down_proj: int, group_size: int, - save_directory: str=None): + save_directory: str=None, + fuse_layers: int=None): os.mkdir(save_directory) weight_dir = os.path.join(save_directory, "model_weights") os.mkdir(weight_dir) layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1" if model.config.model_type == "qwen2": - if model.config.hidden_size == 1536: - # Qwen2-1.5B-Instruct - fused_layers = 1 + if group_size == 0: + if model.config.hidden_size == 1536: + # Qwen2-1.5B-Instruct + fused_layers = 1 if fuse_layers is None else fuse_layers + else: + fused_layers = 2 if fuse_layers is None else 
fuse_layers else: - fused_layers = 2 + fuse_layers = len(model.model.layers) if fuse_layers is None else fuse_layers update_dict = {"kv_len": kv_len, "num_head": model.model.layers[0].self_attn.num_heads, "head_dim": model.model.layers[0].self_attn.head_dim, @@ -469,20 +475,34 @@ def convert_llm_for_deploy(model: torch.nn.Module, embedding_post = False cos_sin_input = False use_prefill_sdp = False - if model.config.vocab_size == 32000: - # for Llama2-7B - fused_layers = 4 - use_prefill_sdp = True - else: - if model.config.intermediate_size == 8192: - # llama3.2 1B & # llama3.2 3B - embedding_post = True - cos_sin_input = True - fused_layers = 2 + if group_size == 0: + if model.config.vocab_size == 32000: + # for Llama2-7B + fused_layers = 4 if fuse_layers is None else fuse_layers + use_prefill_sdp = True else: - # for Llama3-8B - fused_layers = 2 + if model.config.intermediate_size == 8192: + # llama3.2 1B & # llama3.2 3B + embedding_post = True + cos_sin_input = True + fused_layers = 2 if fuse_layers is None else fuse_layers + else: + # for Llama3-8B + fused_layers = 2 if fuse_layers is None else fuse_layers + use_prefill_sdp = True + else: + if model.config.vocab_size == 32000: + # for Llama2-7B use_prefill_sdp = True + else: + if model.config.intermediate_size == 8192: + # llama3.2 1B & # llama3.2 3B + embedding_post = True + cos_sin_input = True + else: + # for Llama3-8B + use_prefill_sdp = True + fuse_layers = len(model.model.layers) if fuse_layers is None else fuse_layers update_dict = {"kv_len": kv_len, "num_head": model.model.layers[0].self_attn.num_heads, "head_dim": model.model.layers[0].self_attn.head_dim, @@ -518,7 +538,10 @@ def convert_llm_for_deploy(model: torch.nn.Module, save_directory, weight_dir, transpose_value_cache, max_prompt_len, group_size, layernorm_const, "prefill") elif model.config.model_type == "minicpm": - fused_layers = 4 + if group_size == 0: + fused_layers = 4 if fuse_layers is None else fuse_layers + else: + fuse_layers = len(model.model.layers) if fuse_layers is None else fuse_layers update_dict = {"kv_len": kv_len, "num_head": model.model.layers[0].self_attn.num_heads, "head_dim": model.model.layers[0].self_attn.head_dim, From 2f868330f5e598dea2dd2fa1cd0894d61af0e9d3 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Thu, 28 Nov 2024 10:56:11 +0800 Subject: [PATCH 2/4] fix --- .../transformers/npu_pipeline_model/convert_pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py index 7ccd1ae9615..4f8421fdfbc 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py @@ -439,7 +439,7 @@ def convert_llm_for_deploy(model: torch.nn.Module, else: fused_layers = 2 if fuse_layers is None else fuse_layers else: - fuse_layers = len(model.model.layers) if fuse_layers is None else fuse_layers + fused_layers = len(model.model.layers) if fuse_layers is None else fuse_layers update_dict = {"kv_len": kv_len, "num_head": model.model.layers[0].self_attn.num_heads, "head_dim": model.model.layers[0].self_attn.head_dim, @@ -502,7 +502,7 @@ def convert_llm_for_deploy(model: torch.nn.Module, else: # for Llama3-8B use_prefill_sdp = True - fuse_layers = len(model.model.layers) if fuse_layers is None else fuse_layers + fused_layers = len(model.model.layers) if fuse_layers is None else fuse_layers 
update_dict = {"kv_len": kv_len, "num_head": model.model.layers[0].self_attn.num_heads, "head_dim": model.model.layers[0].self_attn.head_dim, @@ -541,7 +541,7 @@ def convert_llm_for_deploy(model: torch.nn.Module, if group_size == 0: fused_layers = 4 if fuse_layers is None else fuse_layers else: - fuse_layers = len(model.model.layers) if fuse_layers is None else fuse_layers + fused_layers = len(model.model.layers) if fuse_layers is None else fuse_layers update_dict = {"kv_len": kv_len, "num_head": model.model.layers[0].self_attn.num_heads, "head_dim": model.model.layers[0].self_attn.head_dim, From 7af67fb4202f8357d5780bf4ec2fbaac3708df51 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Thu, 28 Nov 2024 16:04:42 +0800 Subject: [PATCH 3/4] fix llama condition for glm model --- .../npu_pipeline_model/convert_pipeline.py | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py index 4f8421fdfbc..b4c2d9b4f4e 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py @@ -476,32 +476,32 @@ def convert_llm_for_deploy(model: torch.nn.Module, cos_sin_input = False use_prefill_sdp = False if group_size == 0: - if model.config.vocab_size == 32000: + if model.config.intermediate_size == 11008: # for Llama2-7B fused_layers = 4 if fuse_layers is None else fuse_layers use_prefill_sdp = True + elif model.config.intermediate_size == 14336: + # for Llama3-8B + fused_layers = 2 if fuse_layers is None else fuse_layers + use_prefill_sdp = True + elif not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"): + # llama3.2 1B & # llama3.2 3B + embedding_post = True + cos_sin_input = True + fused_layers = 2 if fuse_layers is None else fuse_layers else: - if model.config.intermediate_size == 8192: - # llama3.2 1B & # llama3.2 3B - embedding_post = True - cos_sin_input = True - fused_layers = 2 if fuse_layers is None else fuse_layers - else: - # for Llama3-8B - fused_layers = 2 if fuse_layers is None else fuse_layers - use_prefill_sdp = True + fused_layers = 2 if fuse_layers is None else fuse_layers else: - if model.config.vocab_size == 32000: + if model.config.intermediate_size == 11008: # for Llama2-7B use_prefill_sdp = True - else: - if model.config.intermediate_size == 8192: - # llama3.2 1B & # llama3.2 3B - embedding_post = True - cos_sin_input = True - else: - # for Llama3-8B - use_prefill_sdp = True + elif model.config.intermediate_size == 14336: + # for Llama3-8B + use_prefill_sdp = True + elif not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"): + # llama3.2 1B & # llama3.2 3B + embedding_post = True + cos_sin_input = True fused_layers = len(model.model.layers) if fuse_layers is None else fuse_layers update_dict = {"kv_len": kv_len, "num_head": model.model.layers[0].self_attn.num_heads, From 1e004aedab11c903a6606e585ebc6cbdd0b17ac7 Mon Sep 17 00:00:00 2001 From: rnwang04 Date: Thu, 28 Nov 2024 16:05:16 +0800 Subject: [PATCH 4/4] update --- .../transformers/npu_pipeline_model/convert_pipeline.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py index b4c2d9b4f4e..1c736623975 100644 --- 
a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py @@ -492,11 +492,8 @@ def convert_llm_for_deploy(model: torch.nn.Module, else: fused_layers = 2 if fuse_layers is None else fuse_layers else: - if model.config.intermediate_size == 11008: - # for Llama2-7B - use_prefill_sdp = True - elif model.config.intermediate_size == 14336: - # for Llama3-8B + if model.config.intermediate_size in [11008, 14336]: + # for Llama2-7B & Llama3-8B use_prefill_sdp = True elif not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"): # llama3.2 1B & # llama3.2 3B
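
For reference, a minimal usage sketch of the new fuse_layers keyword from the caller's side, assuming the AutoModelForCausalLM wrapper exported by the patched npu_model module; the checkpoint name, the load_in_low_bit value, and max_prompt_len are illustrative placeholders rather than values taken from this series.

# Hypothetical usage sketch for the `fuse_layers` keyword introduced above.
# Only `fuse_layers`, `convert_model`, `save_directory` and
# `quantization_group_size` appear in the diffs; the checkpoint name,
# `load_in_low_bit` and `max_prompt_len` are assumed placeholder values.
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",   # placeholder checkpoint
    load_in_low_bit="sym_int4",              # assumed low-bit format
    quantization_group_size=0,               # 0 -> per-model fused_layers defaults apply
    max_prompt_len=512,                      # assumed prefill length
    convert_model=True,                      # export blobs via convert_llm_for_deploy()
    save_directory="./npu_converted_model",  # converted graphs + model_weights/ land here
    fuse_layers=2,                           # override: fuse 2 decoder layers per NPU graph
)

# Leaving fuse_layers at its default (None) keeps the heuristics in
# convert_llm_for_deploy(): e.g. fused_layers = 4 for Llama2-7B, 2 for
# Llama3-8B, 1 for Qwen2-1.5B-Instruct when quantization_group_size == 0,
# and fused_layers = len(model.model.layers) on the grouped-quantization path.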