diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py
index cef9e23414d..4e2e3b174e6 100644
--- a/python/llm/src/ipex_llm/transformers/npu_model.py
+++ b/python/llm/src/ipex_llm/transformers/npu_model.py
@@ -186,7 +186,7 @@ def from_pretrained(cls, *args, **kwargs):
             with torch.no_grad():
                 # Only mock quantization_group_size=0 for now
                 cls.load_convert_cpu(qtype, model, "cpu", modules_to_not_convert, 0,
-                                     *args, **kwargs)
+                                     imatrix_data, *args, **kwargs)
             model = model.eval()
             logger.info(f"Finish to convert model")
         else:
@@ -223,7 +223,7 @@ def from_pretrained(cls, *args, **kwargs):
             optimize_llm(model)
             with torch.no_grad():
                 cls.load_convert(qtype, model, "cpu", modules_to_not_convert,
-                                 quantization_group_size, imatrix_data=imatrix_data,
+                                 quantization_group_size, imatrix_data,
                                  *args, **kwargs)
                 if hasattr(model, "llm"):
                     create_npu_kernels(model.llm)
@@ -333,12 +333,12 @@ def load_convert(cls, q_k, optimize_model, device, modules_to_not_convert,
 
     @classmethod
     def load_convert_cpu(cls, q_k, optimize_model, device, modules_to_not_convert,
-                         group_size=0, *arg, **kwarg):
+                         group_size=0, imatrix_data=None, *arg, **kwarg):
         from ipex_llm.transformers.npu_models.convert import replace_with_DequantizedLinear
 
         replace_with_DequantizedLinear(optimize_model, q_k, device=device,
                                        modules_to_not_convert=modules_to_not_convert,
-                                       group_size=group_size)
+                                       group_size=group_size, imatrix=imatrix_data)
 
     @classmethod
     @patch("transformers.dynamic_module_utils.get_imports", patch_flash_attn_import)
@@ -766,7 +766,7 @@ def optimize_npu_model(cls, *args, **kwargs):
                 optimize_llm_pre(model, qtype, mixed_precision,
                                  quantization_group_size=quantization_group_size)
                 cls.load_convert_fp16(qtype, model.encoder, "cpu", modules_to_not_convert,
-                                      quantization_group_size, *args, **kwargs)
+                                      quantization_group_size, None, *args, **kwargs)
                 create_npu_kernels(model.encoder)
             model = model.eval()
             logger.info(f"Finish to convert model")
@@ -781,11 +781,11 @@ def optimize_npu_model(cls, *args, **kwargs):
 
     @classmethod
     def load_convert_fp16(cls, q_k, optimize_model, device, modules_to_not_convert,
-                          group_size=0, *arg, **kwarg):
+                          group_size=0, imatrix_data=None, *arg, **kwarg):
         from ipex_llm.transformers.npu_models.xlm_mp import replace_with_FP16Linear
         replace_with_FP16Linear(optimize_model, q_k, device=device,
                                 modules_to_not_convert=modules_to_not_convert,
-                                group_size=group_size)
+                                group_size=group_size, imatrix=imatrix_data)
 
     def encode(self,
                sentences,
diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert.py b/python/llm/src/ipex_llm/transformers/npu_models/convert.py
index 398d32ecd6a..19d92935c55 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/convert.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/convert.py
@@ -104,7 +104,7 @@ def replace_with_QuantizedLinear(layer, qtype, device, modules_to_not_convert,
 
 @module_optimization
 def replace_with_DequantizedLinear(layer, qtype, device, modules_to_not_convert,
-                                   group_size):
+                                   group_size, imatrix):
     from ipex_llm.transformers.npu_models.linear import DequantizedLinear
     from ipex_llm.transformers.low_bit_linear import ggml_convert_qtype
     from ipex_llm.ggml.quantize import ggml_tensor_qtype
@@ -113,7 +113,8 @@ def replace_with_DequantizedLinear(layer, qtype, device, modules_to_not_convert,
         enable_scale_search = os.environ.get("IPEX_LLM_NPU_QUANTIZATION_OPT", "0") != "0"
         qweights, scale = ggml_convert_qtype(layer.weight.data.to(torch.float32),
                                              iqtype, device=device,
-                                             enable_scale_search=enable_scale_search)
+                                             enable_scale_search=enable_scale_search,
+                                             imatrix=imatrix)
         return DequantizedLinear(qweights, scale, layer.bias)
 
 
diff --git a/python/llm/src/ipex_llm/transformers/npu_models/xlm_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/xlm_mp.py
index 53282918294..dd1cece70e9 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/xlm_mp.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/xlm_mp.py
@@ -721,7 +721,7 @@ def replace_with_Layernorm(layer, qtype=None, device='NPU',
 
 @module_optimization
 def replace_with_FP16Linear(layer, qtype, device, modules_to_not_convert,
-                            group_size):
+                            group_size, imatrix=None):
     from ipex_llm.transformers.npu_models.linear import Linear
     if isinstance(layer, torch.nn.Linear) and not hasattr(layer, "qtype"):
         return Linear(layer.weight, layer.bias)
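
For reference, the snippet below is a minimal sketch of the call path this patch changes, not a runnable script from the repo: model, qtype, and imatrix_data are hypothetical placeholders standing in for the values that from_pretrained already has in scope when it reaches the CPU conversion branch. It only illustrates that the importance matrix is now forwarded as imatrix down to ggml_convert_qtype instead of being dropped.

import torch
from ipex_llm.transformers.npu_models.convert import replace_with_DequantizedLinear

# Placeholders (assumptions, see note above): an already-loaded torch module,
# a low-bit qtype accepted by the NPU conversion path, and the imatrix payload
# prepared earlier in from_pretrained.
model, qtype, imatrix_data = ..., ..., ...

with torch.no_grad():
    # Mirrors load_convert_cpu after this patch: group_size stays at the mocked
    # value 0, and imatrix_data is passed through as `imatrix`, which
    # replace_with_DequantizedLinear in turn hands to ggml_convert_qtype.
    replace_with_DequantizedLinear(model, qtype, device="cpu",
                                   modules_to_not_convert=[],
                                   group_size=0, imatrix=imatrix_data)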