Skip to content

Commit

Permalink
small fix of imatrix (#12480)
Browse files (browse the repository at this point in the history)
  • Loading branch information
rnwang04 authored Dec 3, 2024
1 parent ab01753 commit 598603b
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 10 deletions.
14 changes: 7 additions & 7 deletions python/llm/src/ipex_llm/transformers/npu_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ def from_pretrained(cls, *args, **kwargs):
with torch.no_grad():
# Only mock quantization_group_size=0 for now
cls.load_convert_cpu(qtype, model, "cpu", modules_to_not_convert, 0,
- *args, **kwargs)
+ imatrix_data, *args, **kwargs)
model = model.eval()
logger.info(f"Finish to convert model")
else:
Expand Down Expand Up @@ -223,7 +223,7 @@ def from_pretrained(cls, *args, **kwargs):
optimize_llm(model)
with torch.no_grad():
cls.load_convert(qtype, model, "cpu", modules_to_not_convert,
- quantization_group_size, imatrix_data=imatrix_data,
+ quantization_group_size, imatrix_data,
*args, **kwargs)
if hasattr(model, "llm"):
create_npu_kernels(model.llm)
Expand Down Expand Up @@ -333,12 +333,12 @@ def load_convert(cls, q_k, optimize_model, device, modules_to_not_convert,

@classmethod
def load_convert_cpu(cls, q_k, optimize_model, device, modules_to_not_convert,
- group_size=0, *arg, **kwarg):
+ group_size=0, imatrix_data=None, *arg, **kwarg):
from ipex_llm.transformers.npu_models.convert import replace_with_DequantizedLinear

replace_with_DequantizedLinear(optimize_model, q_k, device=device,
modules_to_not_convert=modules_to_not_convert,
- group_size=group_size)
+ group_size=group_size, imatrix=imatrix_data)

@classmethod
@patch("transformers.dynamic_module_utils.get_imports", patch_flash_attn_import)
Expand Down Expand Up @@ -766,7 +766,7 @@ def optimize_npu_model(cls, *args, **kwargs):
optimize_llm_pre(model, qtype, mixed_precision,
quantization_group_size=quantization_group_size)
cls.load_convert_fp16(qtype, model.encoder, "cpu", modules_to_not_convert,
- quantization_group_size, *args, **kwargs)
+ quantization_group_size, None, *args, **kwargs)
create_npu_kernels(model.encoder)
model = model.eval()
logger.info(f"Finish to convert model")
Expand All @@ -781,11 +781,11 @@ def optimize_npu_model(cls, *args, **kwargs):

@classmethod
def load_convert_fp16(cls, q_k, optimize_model, device, modules_to_not_convert,
- group_size=0, *arg, **kwarg):
+ group_size=0, imatrix_data=None, *arg, **kwarg):
from ipex_llm.transformers.npu_models.xlm_mp import replace_with_FP16Linear
replace_with_FP16Linear(optimize_model, q_k, device=device,
modules_to_not_convert=modules_to_not_convert,
- group_size=group_size)
+ group_size=group_size, imatrix=imatrix_data)

def encode(self,
sentences,
Expand Down
5 changes: 3 additions & 2 deletions python/llm/src/ipex_llm/transformers/npu_models/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def replace_with_QuantizedLinear(layer, qtype, device, modules_to_not_convert,

@module_optimization
def replace_with_DequantizedLinear(layer, qtype, device, modules_to_not_convert,
- group_size):
+ group_size, imatrix):
from ipex_llm.transformers.npu_models.linear import DequantizedLinear
from ipex_llm.transformers.low_bit_linear import ggml_convert_qtype
from ipex_llm.ggml.quantize import ggml_tensor_qtype
Expand All @@ -113,7 +113,8 @@ def replace_with_DequantizedLinear(layer, qtype, device, modules_to_not_convert,
enable_scale_search = os.environ.get("IPEX_LLM_NPU_QUANTIZATION_OPT", "0") != "0"
qweights, scale = ggml_convert_qtype(layer.weight.data.to(torch.float32),
iqtype, device=device,
- enable_scale_search=enable_scale_search)
+ enable_scale_search=enable_scale_search,
+ imatrix=imatrix)
return DequantizedLinear(qweights, scale, layer.bias)


Expand Down
2 changes: 1 addition & 1 deletion python/llm/src/ipex_llm/transformers/npu_models/xlm_mp.py
Original file line number Diff line number Diff line change
Expand Up @@ -721,7 +721,7 @@ def replace_with_Layernorm(layer, qtype=None, device='NPU',

@module_optimization
def replace_with_FP16Linear(layer, qtype, device, modules_to_not_convert,
- group_size):
+ group_size, imatrix=None):
from ipex_llm.transformers.npu_models.linear import Linear
if isinstance(layer, torch.nn.Linear) and not hasattr(layer, "qtype"):
return Linear(layer.weight, layer.bias)

0 comments on commit 598603b

Please sign in to comment.