From 49ab8974fab93b369ea2f089a5d32bb6a334c78d Mon Sep 17 00:00:00 2001 From: Ruonan Wang Date: Thu, 5 Dec 2024 01:40:36 -0800 Subject: [PATCH] [NPU] initial support of `asym_int4_rtn` (#12484) * initial support of q4_1 * fix * fix * update * update min to Z1 * update * fix * update * fix style * fix * support qwen2 optimize_model=True mp version * temp save * fix * fix style * replace min with zero * support split linear for q4_1 * fix lm_head with mixed_precision=True * fix style * revert test code * add down proj back for q4_0 * remove print --- python/llm/src/ipex_llm/ggml/quantize.py | 1 + .../ipex_llm/transformers/low_bit_linear.py | 16 ++- .../src/ipex_llm/transformers/npu_model.py | 8 +- .../transformers/npu_models/convert.py | 28 ++++- .../transformers/npu_models/convert_mp.py | 17 ++- .../transformers/npu_models/linear.py | 19 +++- .../transformers/npu_models/lm_head.py | 35 ++++-- .../transformers/npu_models/mp_models_base.py | 51 ++++++--- .../transformers/npu_models/qwen2_mp.py | 44 +++++-- .../transformers/npu_pipeline_model/common.py | 2 + .../npu_pipeline_model/convert_pipeline.py | 17 ++- .../transformers/npu_pipeline_model/qwen.py | 107 ++++++++++++++---- 12 files changed, 264 insertions(+), 81 deletions(-) diff --git a/python/llm/src/ipex_llm/ggml/quantize.py b/python/llm/src/ipex_llm/ggml/quantize.py index 76702e88117..a95e3464e32 100644 --- a/python/llm/src/ipex_llm/ggml/quantize.py +++ b/python/llm/src/ipex_llm/ggml/quantize.py @@ -52,6 +52,7 @@ "fp6_k": 30, "sym_int4_rtn": 31, "sym_int8_rtn": 32, + "asym_int4_rtn": 33, } # mixed precison from llama.cpp diff --git a/python/llm/src/ipex_llm/transformers/low_bit_linear.py b/python/llm/src/ipex_llm/transformers/low_bit_linear.py index 82fbdf6f506..ed44140d708 100644 --- a/python/llm/src/ipex_llm/transformers/low_bit_linear.py +++ b/python/llm/src/ipex_llm/transformers/low_bit_linear.py @@ -84,8 +84,10 @@ FP6_K = ggml_tensor_qtype["fp6_k"] SYM_INT4_RTN = ggml_tensor_qtype["sym_int4_rtn"] 
SYM_INT8_RTN = ggml_tensor_qtype["sym_int8_rtn"] +ASYM_INT4_RTN = ggml_tensor_qtype["asym_int4_rtn"] RTN_DTYPE = { SYM_INT4_RTN: torch.uint8, + ASYM_INT4_RTN: torch.uint8, SYM_INT8_RTN: torch.int8, } @@ -223,12 +225,16 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int, f"Last dim of input tensor must be multiple of {QK}") dst_size = (n // QK) * block_size_in_bytes - if qtype in [SYM_INT8_RTN, SYM_INT4_RTN]: + if qtype in [SYM_INT8_RTN, SYM_INT4_RTN, ASYM_INT4_RTN]: dst_tensor = torch.empty(dst_size, dtype=RTN_DTYPE[qtype], device=device) dst_tensor = dst_tensor.reshape(tensor.shape[0], tensor.shape[-1] // QK) - scale = torch.empty(n // k, dtype=torch.float32, - device=device) + if qtype == ASYM_INT4_RTN: + scale = torch.empty((n // k) * 2, dtype=torch.float32, + device=device) + else: + scale = torch.empty(n // k, dtype=torch.float32, + device=device) elif qtype == NF4: # Deepspeed zero3 requires unified dtype, # thus here uses bfloat16 consistent to other layers @@ -244,7 +250,7 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int, dst = ctypes.c_void_p(dst_tensor.data.data_ptr()) hist = (ctypes.c_int64 * 16)() if qtype not in [IQ2_XXS, IQ2_XS, Q2_K, IQ1_S, Q4_K, Q6_K, Q5_K, FP6_K]: - if qtype in [SYM_INT8_RTN, SYM_INT4_RTN]: + if qtype in [SYM_INT8_RTN, SYM_INT4_RTN, ASYM_INT4_RTN]: scale_ptr = ctypes.cast(scale.data.data_ptr(), ctypes.POINTER(ctypes.c_float)) if imatrix is None: ggml.ggml_quantize_tensor_rtn(src, dst, scale_ptr, qtype, n, @@ -269,7 +275,7 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int, ggml.ggml_quantize_tensor_with_weights(src, dst, qtype, n // in_features, in_features, hist, imatrix) - if qtype in [SYM_INT8_RTN, SYM_INT4_RTN]: + if qtype in [SYM_INT8_RTN, SYM_INT4_RTN, ASYM_INT4_RTN]: return dst_tensor, scale.type(torch.float16) else: return dst_tensor diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py index 9dbbd1b8fde..9744e2f85f1 100644 --- 
a/python/llm/src/ipex_llm/transformers/npu_model.py +++ b/python/llm/src/ipex_llm/transformers/npu_model.py @@ -103,6 +103,7 @@ def from_pretrained(cls, *args, **kwargs): qtype_map = { "sym_int4": "sym_int4_rtn", "sym_int8": "sym_int8_rtn", + "asym_int4": "asym_int4_rtn", } invalidInputError( @@ -154,7 +155,7 @@ def from_pretrained(cls, *args, **kwargs): f"but got {quantization_group_size}" ) ) - _args = copy.deepcopy(args) + _kwargs = copy.deepcopy(kwargs) try: @@ -270,6 +271,7 @@ def optimize_npu_model(cls, *args, **kwargs): with torch.no_grad(): model.config.update({"mixed_precision": mixed_precision}) model.config.update({"group_size": quantization_group_size}) + model.config.update({"asym": qtype == "asym_int4_rtn"}) optimize_llm_pre(model, qtype, mixed_precision, quantization_group_size=quantization_group_size) cls.load_convert(qtype, model, "cpu", modules_to_not_convert, @@ -416,9 +418,9 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs) ) invalidInputError( - qtype in ["sym_int8_rtn", "sym_int4_rtn"], + qtype in ["sym_int8_rtn", "sym_int4_rtn", "asym_int4_rtn"], f"Unknown bigdl_transformers_low_bit value: {qtype}," - f" expected: sym_int8_rtn, sym_int4_rtn. " + f" expected: sym_int8_rtn, sym_int4_rtn, asym_int4_rtn. 
" ) if enable_cpp_backend: diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert.py b/python/llm/src/ipex_llm/transformers/npu_models/convert.py index 9ac0c9a6dda..2842799b160 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/convert.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/convert.py @@ -88,10 +88,13 @@ def replace_with_QuantizedLinear(layer, qtype, device, modules_to_not_convert, from ipex_llm.ggml.quantize import ggml_tensor_qtype iqtype = ggml_tensor_qtype[qtype] if isinstance(layer, torch.nn.Linear) and not hasattr(layer, "qtype"): - if qtype == "sym_int4_rtn": + if qtype in ["sym_int4_rtn", "asym_int4_rtn"]: # workaround for qwen2-7B & int4 - if (layer.in_features == 3584 and layer.out_features == 152064) or \ - (layer.in_features == 18944 and layer.out_features == 3584): + if (layer.in_features == 3584 and layer.out_features == 152064): + qtype = "sym_int8_rtn" + iqtype = ggml_tensor_qtype[qtype] + if qtype == "sym_int4_rtn": + if (layer.in_features == 18944 and layer.out_features == 3584): qtype = "sym_int8_rtn" iqtype = ggml_tensor_qtype[qtype] enable_scale_search = os.environ.get("IPEX_LLM_NPU_QUANTIZATION_OPT", "0") != "0" @@ -99,8 +102,12 @@ def replace_with_QuantizedLinear(layer, qtype, device, modules_to_not_convert, iqtype, device=device, enable_scale_search=enable_scale_search, imatrix=imatrix) - return QuantizedLinear(qweights, scale, layer.bias, - group_size=group_size) + zero = None + # split scale to scale & zero + if qtype == "asym_int4_rtn": + scale, zero = torch.split(scale, scale.shape[0] // 2) + return QuantizedLinear(qweights, scale, zero, layer.bias, + group_size=group_size, qtype=qtype) @module_optimization @@ -111,12 +118,21 @@ def replace_with_DequantizedLinear(layer, qtype, device, modules_to_not_convert, from ipex_llm.ggml.quantize import ggml_tensor_qtype iqtype = ggml_tensor_qtype[qtype] if isinstance(layer, torch.nn.Linear) and not hasattr(layer, "qtype"): + if qtype in ["sym_int4_rtn", 
"asym_int4_rtn"]: + # workaround for qwen2-7B & int4 + if (layer.in_features == 3584 and layer.out_features == 152064): + qtype = "sym_int8_rtn" + iqtype = ggml_tensor_qtype[qtype] enable_scale_search = os.environ.get("IPEX_LLM_NPU_QUANTIZATION_OPT", "0") != "0" qweights, scale = ggml_convert_qtype(layer.weight.data.to(torch.float32), iqtype, device=device, enable_scale_search=enable_scale_search, imatrix=imatrix) - return DequantizedLinear(qweights, scale, layer.bias) + zero = None + # split scale to scale & zero + if qtype == "asym_int4_rtn": + scale, zero = torch.split(scale, scale.shape[0] // 2) + return DequantizedLinear(qweights, scale, zero, layer.bias, qtype) @module_optimization diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py index 39c9cd00fe6..64d6f30b160 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py @@ -128,7 +128,7 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision, from ipex_llm.transformers.npu_models.common import split_linears if quantization_group_size == 0: n_splits_linear = 1 - if qtype == "sym_int8_rtn": + if qtype in ["sym_int8_rtn", "asym_int4_rtn"]: # do not split mlp down_proj for Qwen2-7B & sym_int8 n_splits_down_proj = 1 else: @@ -154,18 +154,21 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision, # workaround for MiniCPM-2B new_lm_head_0 = SlicedLMHead(model.lm_head_0.weight, split_num=split_num, bias=model.lm_head_0.bias, use_split=True, - group_size=quantization_group_size) + group_size=quantization_group_size, + asym=(qtype == "asym_int4_rtn")) del model.lm_head_0 model.lm_head_0 = new_lm_head_0 new_lm_head_1 = SlicedLMHead(model.lm_head_1.weight, split_num=split_num, bias=model.lm_head_1.bias, use_split=True, - group_size=quantization_group_size) + group_size=quantization_group_size, + asym=(qtype == 
"asym_int4_rtn")) del model.lm_head_1 model.lm_head_1 = new_lm_head_1 else: new_lm_head = SlicedLMHead(model.lm_head.weight, split_num=split_num, bias=model.lm_head.bias, use_split=True, - group_size=quantization_group_size) + group_size=quantization_group_size, + asym=(qtype == "asym_int4_rtn")) del model.lm_head model.lm_head = new_lm_head @@ -176,11 +179,13 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision, # Do not split lm_head and use sym_int8 instead when mixed_precison is True if quantization_group_size == 0: # Do not split lm_head and use sym_int8 instead when mixed_precison is True - is_split = (not mixed_precision) and qtype == "sym_int4_rtn" + is_split = (not mixed_precision) and qtype in ["sym_int4_rtn", "asym_int4_rtn"] split_num = 14 if is_split else 1 new_lm_head = SlicedLMHead(model.lm_head.weight, split_num=split_num, bias=model.lm_head.bias, use_split=True, - group_size=quantization_group_size) + group_size=quantization_group_size, + asym=((qtype == "asym_int4_rtn") and + (not mixed_precision))) del model.lm_head model.lm_head = new_lm_head diff --git a/python/llm/src/ipex_llm/transformers/npu_models/linear.py b/python/llm/src/ipex_llm/transformers/npu_models/linear.py index 2c4b5f37738..c8a5dd467ae 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/linear.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/linear.py @@ -129,7 +129,9 @@ def __init__( self, weight: torch.Tensor, scale: torch.Tensor, + zero: Optional[torch.Tensor] = None, bias: Optional[torch.Tensor] = None, + qtype: Optional[str] = "sym_int4_rtn", group_size: int = 0, ): """Initialize the QuantizedLinear class. @@ -137,8 +139,10 @@ def __init__( Args: weight (torch.Tensor): Linear operation weight scale (torch.Tensor): Quantization scale + zero (Optional[torch.Tensor], optional): Quantization zero for asym_int4_rtn bias (Optional[torch.Tensor], optional): Linear operation optional bias. Defaults to None. 
+ qtype (Optional[str], optional): qtype of this Linear Raises: RuntimeError: Quantized weight must be in torch.int8 format @@ -155,14 +159,19 @@ def __init__( ) ) self.outC, self.inC = self.weight.shape + self.zero = None if group_size != 0: self.scale = Parameter(scale, requires_grad=False) + self.zero = Parameter(zero, requires_grad=False) else: if self.weight.dtype == torch.uint8: # Int4 we need to double the input channels because weights are compressed self.inC *= 2 self.scale = Parameter(scale * math.sqrt(self.inC), requires_grad=False) + if zero is not None: + self.zero = Parameter(zero * math.sqrt(self.inC), requires_grad=False) self.bias = bias + self.qtype = qtype self.op_id = str(uuid.uuid4()) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -195,7 +204,8 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: ) ) - out = run_matmul(x, self.weight.data, self.scale.data, self.op_id) + zero_data = self.zero.data if self.zero is not None else None + out = run_matmul(x, self.weight.data, self.scale.data, zero_data, self.op_id) if self.bias is None: return out @@ -209,14 +219,18 @@ def __init__( self, weight: torch.Tensor, scale: torch.Tensor, + zero: Optional[torch.Tensor] = None, bias: Optional[torch.Tensor] = None, + qtype: Optional[str] = "sym_int4_rtn", ): """Initialize the DequantizedLinear class. Args: weight (torch.Tensor): Linear operation quantized weight scale (torch.Tensor): Quantization scale + zero (Optional[torch.Tensor], optional): Quantization zero for asym_int4_rtn bias (Optional[torch.Tensor], optional): Linear operation optional bias. Defaults to None. 
+ qtype (Optional[str], optional): qtype of this Linear Raises: RuntimeError: Quantized weight must be in torch.int8 format """ @@ -240,6 +254,9 @@ def __init__( decompressed_weight = combined_weight.view(combined_weight.size(0), -1) dequantized_weight = decompressed_weight.to(torch.float32) * \ torch.unsqueeze(scale.to(torch.float32), dim=1) + if qtype == "asym_int4_rtn" and zero is not None: + dequantized_weight = dequantized_weight + torch.unsqueeze(zero.to(torch.float32), + dim=1) self.weight = Parameter(dequantized_weight, requires_grad=False).contiguous() else: dequantized_weight = weight.to(torch.float32) * \ diff --git a/python/llm/src/ipex_llm/transformers/npu_models/lm_head.py b/python/llm/src/ipex_llm/transformers/npu_models/lm_head.py index f306ae0e4e0..0184805996b 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/lm_head.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/lm_head.py @@ -36,6 +36,7 @@ def __init__( dtype: np.dtype = np.int8, use_split: bool = False, group_size: int = 0, + asym: bool = False, ): """Initialize the LMHeadLinear class. 
@@ -54,11 +55,10 @@ def __init__( self.batch = batch self.split_num = split_num - if use_split: input = self.parameter((1, self.batch, self.inC)) res = self.dq_split_linear(input, self.split_num, self.outC, self.inC, wt_dtype=dtype, - scale_factor=(group_size == 0)) + scale_factor=(group_size == 0), asym=asym) else: input = self.parameter((self.batch, self.inC)) split_size = self.inC // split_num // 2 * 2 @@ -69,7 +69,7 @@ def __init__( input_slice = self.slice(input, begin=[0, start_idx], end=[self.batch, end_idx]) linear_slice = self.linear(input_slice, outC, split_size, bias=False, - wt_dtype=dtype) + wt_dtype=dtype, asym=asym) if i == 0: res = linear_slice else: @@ -109,7 +109,7 @@ def run( class SlicedLMHead(nn.Module): - def __init__(self, weight, bias, split_num, use_split=False, group_size=0): + def __init__(self, weight, bias, split_num, use_split=False, group_size=0, asym=False): super().__init__() self.split_num = split_num self.outC, self.inC = weight.shape @@ -128,6 +128,7 @@ def __init__(self, weight, bias, split_num, use_split=False, group_size=0): self.lm_heads.append(new_linear) self.bias = bias self.use_split = use_split + self.asym = asym def forward(self, hidden_states): if hidden_states.size(0) * hidden_states.size(1) == 1: @@ -162,19 +163,33 @@ def get_fused_lm_head(self): np_dtype = np.uint8 if self.get_weight_dtype() == torch.uint8 else np.int8 self.fused_lm_head = LMHeadLinear(self.inC, self.outC, 1, self.split_num, False, "NPU", dtype=np_dtype, use_split=self.use_split, - group_size=self.group_size) + group_size=self.group_size, asym=self.asym) if self.use_split: weights = [] scales = [] + zeros = [] for i in range(self.split_num): weights.append(self.lm_heads[i].weight) scales.append(self.lm_heads[i].scale) - fused_lm_head_weights = (torch.stack(weights, axis=0).numpy(), - torch.stack(scales, axis=0).numpy()) + if self.lm_heads[i].zero is not None: + zeros.append(self.lm_heads[i].zero) + if len(zeros): + fused_lm_head_weights = 
[(torch.stack(weights, axis=0).numpy(), + torch.stack(scales, axis=0).numpy(), + torch.stack(zeros, axis=0).numpy())] + else: + fused_lm_head_weights = [(torch.stack(weights, axis=0).numpy(), + torch.stack(scales, axis=0).numpy())] else: - fused_lm_head_weights = [(self.lm_heads[i].weight.data.numpy(), - self.lm_heads[i].scale.data.numpy()) - for i in range(self.split_num)] + if self.asym: + fused_lm_head_weights = [(self.lm_heads[i].weight.data.numpy(), + self.lm_heads[i].scale.data.numpy(), + self.lm_heads[i].zero.data.numpy()) + for i in range(self.split_num)] + else: + fused_lm_head_weights = [(self.lm_heads[i].weight.data.numpy(), + self.lm_heads[i].scale.data.numpy()) + for i in range(self.split_num)] self.fused_lm_head.set_weights(self.lm_heads[0].op_id, fused_lm_head_weights) diff --git a/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py b/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py index ccf6e242d90..a1dac609243 100644 --- a/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/mp_models_base.py @@ -59,9 +59,16 @@ def run_model( op_args_flatten = [] for w in weights: if isinstance(w, tuple): # from QuantizedLinear - op_args.append((set_contiguous(w[0]).numpy(), set_contiguous(w[1]).numpy())) - op_args_flatten.append(op_args[-1][0]) - op_args_flatten.append(op_args[-1][1]) + if len(w) == 2: + op_args.append((set_contiguous(w[0]).numpy(), set_contiguous(w[1]).numpy())) + op_args_flatten.append(op_args[-1][0]) + op_args_flatten.append(op_args[-1][1]) + else: + op_args.append((set_contiguous(w[0]).numpy(), set_contiguous(w[1]).numpy(), + set_contiguous(w[2]).numpy())) + op_args_flatten.append(op_args[-1][0]) + op_args_flatten.append(op_args[-1][1]) + op_args_flatten.append(op_args[-1][2]) elif w.dtype in [torch.int8, torch.uint8]: # QuantizedLinear weight op_args.append(w.numpy()) op_args_flatten.append(op_args[-1]) @@ -104,7 +111,7 @@ def run_model( class 
LLMBaseNNFactory(NNFactory): def __init__(self, max_seq_len, transpose_value, dtype, profile=False, device="NPU", - n_splits_linear=1, n_splits_down_proj=1, group_size=0): + n_splits_linear=1, n_splits_down_proj=1, group_size=0, asym=False): super().__init__(profile, device) self.cache_parameter_ops = [] self.input_ops = [] @@ -117,6 +124,7 @@ def __init__(self, max_seq_len, transpose_value, dtype, profile=False, device="N self.n_splits_linear = n_splits_linear self.n_splits_down_proj = n_splits_down_proj self.group_size = group_size + self.asym = asym def attention(self, *, @@ -149,7 +157,8 @@ def attention(self, wt_dtype=self.dtype, n_splits=self.n_splits_linear, scale_factor=(self.group_size == 0), - is_prefill=(mode == "prefill") + is_prefill=(mode == "prefill"), + asym=self.asym ) key_states = self.linear( @@ -160,7 +169,8 @@ def attention(self, wt_dtype=self.dtype, n_splits=self.n_splits_linear, scale_factor=(self.group_size == 0), - is_prefill=(mode == "prefill") + is_prefill=(mode == "prefill"), + asym=self.asym ) value_states = self.linear( @@ -171,7 +181,8 @@ def attention(self, wt_dtype=self.dtype, n_splits=self.n_splits_linear, scale_factor=(self.group_size == 0), - is_prefill=(mode == "prefill") + is_prefill=(mode == "prefill"), + asym=self.asym ) if q_bias is not None: @@ -260,7 +271,8 @@ def attention(self, attn_output, hidden_size, hidden_size, bias=False, wt_dtype=self.dtype, n_splits=self.n_splits_linear, scale_factor=(self.group_size == 0), - is_prefill=(mode == "prefill") + is_prefill=(mode == "prefill"), + asym=self.asym ) return attn_output, new_key_states, new_value_states @@ -428,13 +440,15 @@ def mlp(self, hidden_states, seq_len=-1, mode="prefill"): hidden_states, self.intermediate_size, self.hidden_size, bias=False, wt_dtype=self.dtype, n_splits=self.n_splits_linear, scale_factor=(self.group_size == 0), - is_prefill=(mode == "prefill") + is_prefill=(mode == "prefill"), + asym=self.asym ) mm2 = self.linear( hidden_states, 
self.intermediate_size, self.hidden_size, bias=False, wt_dtype=self.dtype, n_splits=self.n_splits_linear, scale_factor=(self.group_size == 0), - is_prefill=(mode == "prefill") + is_prefill=(mode == "prefill"), + asym=self.asym ) # type: ignore[attr-defined] mm1 = self.eltwise_mul(self.swish(mm1), mm2) # type: ignore[attr-defined] @@ -442,7 +456,8 @@ def mlp(self, hidden_states, seq_len=-1, mode="prefill"): mm1, self.hidden_size, self.intermediate_size, bias=False, wt_dtype=self.dtype, n_splits=self.n_splits_down_proj, scale_factor=(self.group_size == 0), - is_prefill=(mode == "prefill") + is_prefill=(mode == "prefill"), + asym=self.asym ) return hidden_states @@ -558,17 +573,20 @@ def linear(self, wt_dtype: npt.DTypeLike = np.float16, n_splits: int = 1, scale_factor: bool = True, - is_prefill: bool = False): + is_prefill: bool = False, + asym: bool = False): if n_splits == 1: op = super().linear(input_node, output_channels, input_channels, bias, act_dtype, - wt_dtype, scale_factor=scale_factor) + wt_dtype, scale_factor=scale_factor, + asym=asym) else: op = super().dq_split_linear(input_node, n_splits, output_channels, input_channels, bias=bias, act_dtype=act_dtype, wt_dtype=wt_dtype, scale_factor=scale_factor, - is_prefill=is_prefill) + is_prefill=is_prefill, + asym=asym) self.linear_ops.append(op) return op @@ -580,10 +598,11 @@ def dq_split_linear(self, act_dtype: npt.DTypeLike = np.float16, wt_dtype: npt.DTypeLike = np.float16, scale_factor: bool = False, - is_prefill: bool = False): + is_prefill: bool = False, + asym: bool = False): op = super().dq_split_linear(input_node, n_splits, output_channels, input_channels, False, act_dtype, wt_dtype, scale_factor, - is_prefill=is_prefill) + is_prefill=is_prefill, asym=asym) self.linear_ops.append(op) return op diff --git a/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py index 015efe10031..397739cb72a 100644 --- 
a/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py +++ b/python/llm/src/ipex_llm/transformers/npu_models/qwen2_mp.py @@ -97,7 +97,8 @@ def __init__( intermediate_size, n_splits_linear: int = 1, n_splits_down_proj: int = 1, - group_size: int = 0 + group_size: int = 0, + asym: bool = False, ): super().__init__(max_seq_len=max_seq_len, transpose_value=transpose_value, @@ -106,7 +107,8 @@ def __init__( device=device, n_splits_linear=n_splits_linear, n_splits_down_proj=n_splits_down_proj, - group_size=group_size) + group_size=group_size, + asym=asym) self.max_seq_len = max_seq_len self.intermediate_size = intermediate_size self.dtype = dtype @@ -311,6 +313,7 @@ def __init__( n_splits_linear: int = 1, n_splits_down_proj: int = 1, group_size: int = 0, + asym: bool = False, ): super().__init__() @@ -318,8 +321,10 @@ def __init__( op_parameters = [] for w in parameters: - if isinstance(w, tuple): # from QuantizedLinear + if isinstance(w, tuple) and not asym: # from QuantizedLinear op_parameters.append((w[0].numpy(), w[1].numpy())) + elif isinstance(w, tuple) and asym: # from QuantizedLinear + op_parameters.append((w[0].numpy(), w[1].numpy(), w[2].numpy())) elif w.dtype in [torch.int8, torch.uint8]: # QuantizedLinear weight op_parameters.append(w.numpy()) elif isinstance(w, np.ndarray): # scale @@ -375,7 +380,8 @@ def __init__( dtype=np_dtype, n_splits_linear=n_splits_linear, n_splits_down_proj=n_splits_down_proj, - group_size=group_size + group_size=group_size, + asym=asym, ) self.backend_decoders.append(decoder) @@ -461,6 +467,7 @@ def __init__( n_splits_linear: int = 1, n_splits_down_proj: int = 1, group_size: int = 0, + asym: bool = False, ): super().__init__() self.op_parameters = parameters @@ -491,7 +498,8 @@ def __init__( dtype=np_dtype, n_splits_linear=n_splits_linear, n_splits_down_proj=n_splits_down_proj, - group_size=group_size + group_size=group_size, + asym=asym ) self.layer_norm_0 = layer_norm_0 self.layer_norm_1 = layer_norm_1 @@ -580,6 +588,7 @@ 
def run_decode( layer_indexs = range(layer_start, layer_end) n_splits_linear = len(model.model.layers[0].mlp.gate_proj_dq_list) n_splits_down_proj = len(model.model.layers[0].mlp.down_proj_dq_list) + asym = getattr(model.config, "asym", False) for layer_idx in layer_indexs: curr_layer = model.model.layers[layer_idx] attn_layer = curr_layer.self_attn @@ -592,10 +601,17 @@ def run_decode( mlp_layer.down_proj_dq_list]: l_weights = [] scales = [] + zeros = [] for l in layer_list: l_weights.append(l.weight) scales.append(l.scale) - weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) + if l.zero is not None: + zeros.append(l.zero) + if len(zeros): + weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0), + torch.stack(zeros, axis=0))) + else: + weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16) cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16) @@ -630,7 +646,8 @@ def run_decode( do_print=False, n_splits_linear=n_splits_linear, n_splits_down_proj=n_splits_down_proj, - group_size=group_size + group_size=group_size, + asym=asym ) dist.barrier() @@ -809,6 +826,7 @@ def run_prefill( layer_indexs = range(layer_start, layer_end) n_splits_linear = len(model.model.layers[0].mlp.gate_proj_dq_list) n_splits_down_proj = len(model.model.layers[0].mlp.down_proj_dq_list) + asym = getattr(model.config, "asym", False) for layer_idx in layer_indexs: curr_layer = model.model.layers[layer_idx] attn_layer = curr_layer.self_attn @@ -821,10 +839,17 @@ def run_prefill( mlp_layer.down_proj_dq_list]: l_weights = [] scales = [] + zeros = [] for l in layer_list: l_weights.append(l.weight) scales.append(l.scale) - weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) + if l.zero is not None: + zeros.append(l.zero) + if len(zeros): + weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, 
axis=0), + torch.stack(zeros, axis=0))) + else: + weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16) cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16) @@ -850,7 +875,8 @@ def run_prefill( transpose_value=transpose_value_cache, n_splits_linear=n_splits_linear, n_splits_down_proj=n_splits_down_proj, - group_size=group_size + group_size=group_size, + asym=asym ) layer_weights.extend(weights) diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py index b38299473d4..87459a99e98 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/common.py @@ -86,6 +86,7 @@ def __init__( device: str = "NPU", n_splits: int = 1, group_size: int = 0, + asym: bool = False ): super().__init__(max_seq_len=max_seq_len, transpose_value=transpose_value, @@ -119,6 +120,7 @@ def __init__( hidden_states, self.vocab_size, self.hidden_size, bias=False, wt_dtype=self.dtype, n_splits=n_splits, scale_factor=(group_size == 0), + asym=asym ) # define outputs diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py index 2e6b249c1a5..337736a7ea8 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py @@ -201,7 +201,7 @@ def convert_llm(model: torch.nn.Module, layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1" if group_size == 0: n_splits_linear = 1 - if qtype == "sym_int8_rtn": + if qtype in ["sym_int8_rtn", "asym_int4_rtn"]: # do not split mlp down_proj for Qwen2-7B & sym_int8 n_splits_down_proj = 1 else: @@ -434,6 +434,12 @@ def convert_llm_for_deploy(model: 
torch.nn.Module, os.mkdir(weight_dir) layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1" + lm_head_low_bit = getattr(model.config, "bigdl_transformers_low_bit", "sym_int4_rtn") + if not isinstance(model.lm_head, SlicedLMHead): + lm_head_low_bit = model.lm_head.qtype + else: + lm_head_low_bit = model.lm_head.lm_heads[0].qtype + if model.config.model_type == "qwen2": if group_size == 0: if model.config.hidden_size == 1536: @@ -456,7 +462,8 @@ def convert_llm_for_deploy(model: torch.nn.Module, "weight_num": 7, "weight_idx": 8, "n_splits_linear": n_splits_linear, - "n_splits_down_proj": n_splits_down_proj} + "n_splits_down_proj": n_splits_down_proj, + "lm_head_low_bit": lm_head_low_bit} model.config.update(update_dict) model.config.save_pretrained(save_directory) @@ -517,7 +524,8 @@ def convert_llm_for_deploy(model: torch.nn.Module, "embedding_post": embedding_post, "cos_sin_input": cos_sin_input, "n_splits_linear": n_splits_linear, - "n_splits_down_proj": n_splits_down_proj} + "n_splits_down_proj": n_splits_down_proj, + "lm_head_low_bit": lm_head_low_bit} model.config.update(update_dict) model.config.save_pretrained(save_directory) @@ -556,7 +564,8 @@ def convert_llm_for_deploy(model: torch.nn.Module, "model_type": "minicpm", "embedding_post": True, "n_splits_linear": n_splits_linear, - "n_splits_down_proj": n_splits_down_proj} + "n_splits_down_proj": n_splits_down_proj, + "lm_head_low_bit": lm_head_low_bit} model.config.update(update_dict) model.config.save_pretrained(save_directory) diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py index e4b318244ce..bb8003f06a7 100644 --- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py +++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/qwen.py @@ -31,17 +31,32 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir, model_norm = model.model.norm lm_head = model.lm_head 
lm_head_n_splits = 1 + asym = getattr(model.config, "asym", False) + if not isinstance(lm_head, SlicedLMHead): - weights = [(lm_head.weight, lm_head.scale)] + asym = lm_head.qtype == "asym_int4_rtn" + if asym: + weights = [(lm_head.weight, lm_head.scale, lm_head.zero)] + else: + weights = [(lm_head.weight, lm_head.scale)] else: lm_heads = lm_head.lm_heads + asym = lm_heads[0].qtype == "asym_int4_rtn" lm_head_weights = [] scales = [] + zeros = [] for l in lm_heads: lm_head_weights.append(l.weight) scales.append(l.scale) - weights = [(torch.stack(lm_head_weights, axis=0), - torch.stack(scales, axis=0))] + if l.zero is not None: + zeros.append(l.zero) + if len(zeros): + weights = [(torch.stack(lm_head_weights, axis=0), + torch.stack(scales, axis=0), + torch.stack(zeros, axis=0))] + else: + weights = [(torch.stack(lm_head_weights, axis=0), + torch.stack(scales, axis=0))] lm_head_n_splits = lm_head.split_num if isinstance(weights[0], tuple): np_dtype = np.int8 if weights[0][0].dtype == torch.int8 else np.uint8 @@ -60,6 +75,7 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir, vocab_size=vocab_size, n_splits=lm_head_n_splits, group_size=group_size, + asym=asym ) last_blob_path = update_names_of_IR_and_export_blob(new_lm_head, f"lm_head", @@ -67,9 +83,15 @@ def convert_lm_head_and_embedding(model, temp_dir, weight_dir, # save weights bins files if not isinstance(lm_head, SlicedLMHead): - weight_numpy = [ - lm_head.weight.data.numpy(), lm_head.scale.data.numpy(), - ] + if not asym: + weight_numpy = [ + lm_head.weight.data.numpy(), lm_head.scale.data.numpy(), + ] + else: + weight_numpy = [ + lm_head.weight.data.numpy(), lm_head.scale.data.numpy(), + lm_head.zero.data.numpy() + ] else: weight_numpy = [v.numpy() for v in weights[0]] @@ -104,6 +126,7 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj, head_dim = model.model.layers[0].self_attn.head_dim intermediate_size = model.config.intermediate_size rms_norm_eps = 
model.config.rms_norm_eps + asym = getattr(model.config, "asym", False) from ipex_llm.transformers.npu_models.qwen2_mp import LowBitQwenMultiDecoderlayer curr_layer = model.model.layers[layer_idx] @@ -117,10 +140,17 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj, mlp_layer.down_proj_dq_list]: l_weights = [] scales = [] + zeros = [] for l in layer_list: l_weights.append(l.weight) scales.append(l.scale) - weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) + if l.zero is not None: + zeros.append(l.zero) + if len(zeros): + weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0), + torch.stack(zeros, axis=0))) + else: + weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) q_bias = attn_layer.q_proj_dq_list.q_proj_dq_0.bias.to(torch.float16) k_bias = attn_layer.k_proj_dq_list.k_proj_dq_0.bias.to(torch.float16) @@ -164,7 +194,8 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj, dtype=np_dtype, n_splits_linear=n_splits_linear, n_splits_down_proj=n_splits_down_proj, - group_size=group_size + group_size=group_size, + asym=asym ) rest_blob_path = update_names_of_IR_and_export_blob(single_decoder, decoder_name, @@ -188,11 +219,23 @@ def convert_qwen_layer(model, layer_idx, n_splits_linear, n_splits_down_proj, k_bias.data.numpy().tofile(k_bias_bin_file) v_bias.data.numpy().tofile(v_bias_bin_file) # 6, 7 are past k/v - for idx, (weight, scale) in enumerate(weights): - bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+5+idx*2}.bin") - weight.numpy().tofile(bin_file) - bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+5+idx*2+1}.bin") - scale.numpy().tofile(bin_file) + if not asym: + for idx, (weight, scale) in enumerate(weights): + bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*2}.bin") + weight.numpy().tofile(bin_file) + bin_file = os.path.join(weight_dir, + 
f"model_{layer_idx}_input_{st_idx+3+idx*2+1}.bin") + scale.numpy().tofile(bin_file) + else: + for idx, (weight, scale, zero) in enumerate(weights): + bin_file = os.path.join(weight_dir, f"model_{layer_idx}_input_{st_idx+3+idx*3}.bin") + weight.numpy().tofile(bin_file) + bin_file = os.path.join(weight_dir, + f"model_{layer_idx}_input_{st_idx+3+idx*3+1}.bin") + scale.numpy().tofile(bin_file) + bin_file = os.path.join(weight_dir, + f"model_{layer_idx}_input_{st_idx+3+idx*3+2}.bin") + zero.numpy().tofile(bin_file) del single_decoder @@ -207,6 +250,7 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down rms_norm_eps = model.config.rms_norm_eps layer_num = len(model.model.layers) fused_layer_num = layer_num // fused_layers + asym = getattr(model.config, "asym", False) from ipex_llm.transformers.npu_models.qwen2_mp import LowBitQwenMultiDecoderlayer for i in range(fused_layers): @@ -233,10 +277,17 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down mlp_layer.down_proj_dq_list]: l_weights = [] scales = [] + zeros = [] for l in layer_list: l_weights.append(l.weight) scales.append(l.scale) - weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) + if l.zero is not None: + zeros.append(l.zero) + if len(zeros): + weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0), + torch.stack(zeros, axis=0))) + else: + weights.append((torch.stack(l_weights, axis=0), torch.stack(scales, axis=0))) cached_cos = curr_layer.self_attn.rotary_emb.cos_cached.to(torch.float16) cached_sin = curr_layer.self_attn.rotary_emb.sin_cached.to(torch.float16) @@ -264,12 +315,25 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down k_biases[-1].data.numpy().tofile(k_bias_bin_file) v_biases[-1].data.numpy().tofile(v_bias_bin_file) # 6, 7 are past k/v - for idx, (weight, scale) in enumerate(weights): - bin_file = os.path.join(weight_dir, 
f"model_{layer_idx}_input_{st_idx+3+idx*2}.bin") - weight.numpy().tofile(bin_file) - bin_file = os.path.join(weight_dir, - f"model_{layer_idx}_input_{st_idx+3+idx*2+1}.bin") - scale.numpy().tofile(bin_file) + if not asym: + for idx, (weight, scale) in enumerate(weights): + bin_file = os.path.join(weight_dir, + f"model_{layer_idx}_input_{st_idx+3+idx*2}.bin") + weight.numpy().tofile(bin_file) + bin_file = os.path.join(weight_dir, + f"model_{layer_idx}_input_{st_idx+3+idx*2+1}.bin") + scale.numpy().tofile(bin_file) + else: + for idx, (weight, scale, zero) in enumerate(weights): + bin_file = os.path.join(weight_dir, + f"model_{layer_idx}_input_{st_idx+3+idx*3}.bin") + weight.numpy().tofile(bin_file) + bin_file = os.path.join(weight_dir, + f"model_{layer_idx}_input_{st_idx+3+idx*3+1}.bin") + scale.numpy().tofile(bin_file) + bin_file = os.path.join(weight_dir, + f"model_{layer_idx}_input_{st_idx+3+idx*3+2}.bin") + zero.numpy().tofile(bin_file) if isinstance(weights[0], tuple): np_dtype = np.int8 if weights[0][0].dtype == torch.int8 else np.uint8 @@ -296,7 +360,8 @@ def convert_fused_qwen_layer(model, fused_layers, n_splits_linear, n_splits_down dtype=np_dtype, n_splits_linear=n_splits_linear, n_splits_down_proj=n_splits_down_proj, - group_size=group_size + group_size=group_size, + asym=asym ) update_names_of_IR_and_export_blob(fused_decoder, f"decoder_layer_{i}",