diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py
index eb684bce715..fc3c0879dd1 100644
--- a/python/llm/src/ipex_llm/transformers/npu_model.py
+++ b/python/llm/src/ipex_llm/transformers/npu_model.py
@@ -136,6 +136,7 @@ def from_pretrained(cls, *args, **kwargs):
         mock_device = kwargs.pop('device', None)  # For mock on CPU
         convert_model = kwargs.pop('convert_model', False)
         save_directory = kwargs.pop('save_directory', None)
+        fuse_layers = kwargs.pop('fuse_layers', None)
 
         invalidInputError(
             quantization_group_size in [0, 32, 64, 128],
@@ -204,6 +205,7 @@ def from_pretrained(cls, *args, **kwargs):
                 "transpose_value_cache": transpose_value_cache,
                 "convert_model": convert_model,
                 "save_directory": save_directory,
+                "fuse_layers": fuse_layers
             }
             model = cls.optimize_npu_model(*args, **optimize_kwargs)
         else:
@@ -243,6 +245,7 @@ def optimize_npu_model(cls, *args, **kwargs):
         transpose_value_cache = kwargs.pop("transpose_value_cache", True)
         convert_model = kwargs.pop('convert_model', False)
         save_directory = kwargs.pop('save_directory', None)
+        fuse_layers = kwargs.pop('fuse_layers', None)
 
         if hasattr(model, "llm"):
             llm = model.llm
@@ -282,7 +285,8 @@ def optimize_npu_model(cls, *args, **kwargs):
                         group_size=quantization_group_size,
                         qtype=qtype,
                         convert_model=convert_model,
-                        save_directory=save_directory)
+                        save_directory=save_directory,
+                        fuse_layers=fuse_layers)
 
         model.save_low_bit = types.MethodType(save_low_bit, model)
         return model
diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
index e060e3a30ed..1c736623975 100644
--- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
+++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
@@ -195,7 +195,8 @@ def convert_llm(model: torch.nn.Module,
                 group_size: int,
                 qtype: str,
                 convert_model: bool=False,
-                save_directory: str=None):
+                save_directory: str=None,
+                fuse_layers: int=None):
     # whether to set layernorm weight as const
     layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"
     if group_size == 0:
@@ -216,7 +217,8 @@ def convert_llm(model: torch.nn.Module,
                                n_splits_linear,
                                n_splits_down_proj,
                                group_size,
-                               save_directory)
+                               save_directory,
+                               fuse_layers=fuse_layers)
         return 0
     if model.config.model_type == "llama":
         with tempfile.TemporaryDirectory() as temp_dir:
@@ -422,18 +424,22 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                            n_splits_linear: int,
                            n_splits_down_proj: int,
                            group_size: int,
-                           save_directory: str=None):
+                           save_directory: str=None,
+                           fuse_layers: int=None):
     os.mkdir(save_directory)
     weight_dir = os.path.join(save_directory, "model_weights")
     os.mkdir(weight_dir)
     layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"
 
     if model.config.model_type == "qwen2":
-        if model.config.hidden_size == 1536:
-            # Qwen2-1.5B-Instruct
-            fused_layers = 1
+        if group_size == 0:
+            if model.config.hidden_size == 1536:
+                # Qwen2-1.5B-Instruct
+                fused_layers = 1 if fuse_layers is None else fuse_layers
+            else:
+                fused_layers = 2 if fuse_layers is None else fuse_layers
         else:
-            fused_layers = 2
+            fused_layers = len(model.model.layers) if fuse_layers is None else fuse_layers
         update_dict = {"kv_len": kv_len,
                        "num_head": model.model.layers[0].self_attn.num_heads,
                        "head_dim": model.model.layers[0].self_attn.head_dim,
@@ -469,20 +475,31 @@ def convert_llm_for_deploy(model: torch.nn.Module,
         embedding_post = False
         cos_sin_input = False
         use_prefill_sdp = False
-        if model.config.vocab_size == 32000:
-            # for Llama2-7B
-            fused_layers = 4
-            use_prefill_sdp = True
-        else:
-            if model.config.intermediate_size == 8192:
+        if group_size == 0:
+            if model.config.intermediate_size == 11008:
+                # for Llama2-7B
+                fused_layers = 4 if fuse_layers is None else fuse_layers
+                use_prefill_sdp = True
+            elif model.config.intermediate_size == 14336:
+                # for Llama3-8B
+                fused_layers = 2 if fuse_layers is None else fuse_layers
+                use_prefill_sdp = True
+            elif not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"):
                 # llama3.2 1B & # llama3.2 3B
                 embedding_post = True
                 cos_sin_input = True
-                fused_layers = 2
+                fused_layers = 2 if fuse_layers is None else fuse_layers
             else:
-                # for Llama3-8B
-                fused_layers = 2
+                fused_layers = 2 if fuse_layers is None else fuse_layers
+        else:
+            if model.config.intermediate_size in [11008, 14336]:
+                # for Llama2-7B & Llama3-8B
                 use_prefill_sdp = True
+            elif not hasattr(model.model.layers[0].self_attn.rotary_emb, "cos_cached"):
+                # llama3.2 1B & # llama3.2 3B
+                embedding_post = True
+                cos_sin_input = True
+            fused_layers = len(model.model.layers) if fuse_layers is None else fuse_layers
         update_dict = {"kv_len": kv_len,
                        "num_head": model.model.layers[0].self_attn.num_heads,
                        "head_dim": model.model.layers[0].self_attn.head_dim,
@@ -518,7 +535,10 @@ def convert_llm_for_deploy(model: torch.nn.Module,
                             save_directory, weight_dir, transpose_value_cache, max_prompt_len,
                             group_size, layernorm_const, "prefill")
     elif model.config.model_type == "minicpm":
-        fused_layers = 4
+        if group_size == 0:
+            fused_layers = 4 if fuse_layers is None else fuse_layers
+        else:
+            fused_layers = len(model.model.layers) if fuse_layers is None else fuse_layers
         update_dict = {"kv_len": kv_len,
                        "num_head": model.model.layers[0].self_attn.num_heads,
                        "head_dim": model.model.layers[0].self_attn.head_dim,