From 397d6619e68fd9be06da55d9e609bd2ca063aab5 Mon Sep 17 00:00:00 2001
From: changwangss
Date: Wed, 19 Jun 2024 06:28:15 -0700
Subject: [PATCH] support layerwise

Signed-off-by: changwangss
---
 .../transformers/llm/quantization/utils.py |  5 +++++
 .../transformers/modeling/modeling_auto.py | 18 +++++++++++-------
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/intel_extension_for_transformers/transformers/llm/quantization/utils.py b/intel_extension_for_transformers/transformers/llm/quantization/utils.py
index 481505502ca..8eeb6ea4b3d 100644
--- a/intel_extension_for_transformers/transformers/llm/quantization/utils.py
+++ b/intel_extension_for_transformers/transformers/llm/quantization/utils.py
@@ -25,6 +25,7 @@
 from datasets import load_dataset
 from neural_compressor import quantization
 from neural_compressor.adaptor.torch_utils.model_wrapper import WeightOnlyLinear
+from neural_compressor.utils.pytorch import load
 from neural_compressor.utils.utility import LazyImport
 from neural_compressor.config import PostTrainingQuantConfig
 from intel_extension_for_transformers.tools.utils import (
@@ -583,6 +584,10 @@ def default_calib_func(model):
     inc_model = quantization.fit(
         model, conf, calib_func=calib_func, calib_dataloader=calib_dataloader
     )
+    if config.layer_wise:
+        inc_model.save("./tmp")
+        inc_model = load("./tmp", model, weight_only=True, layer_wise=True)
+        return inc_model.eval()

     inc_model.eval()
     if device == "xpu" or device == torch.device("xpu"):
diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
index 3cbd2bb2f25..a2b357656c7 100644
--- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
+++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
@@ -74,6 +74,7 @@
 from accelerate import init_empty_weights
 from huggingface_hub import hf_hub_download
 from neural_compressor.adaptor.torch_utils.model_wrapper import WeightOnlyLinear
+from neural_compressor.adaptor.torch_utils.layer_wise_quant import load_empty_model
 from neural_compressor.model.torch_model import PyTorchFXModel
 from threading import Thread
 from transformers.configuration_utils import PretrainedConfig
@@ -778,13 +779,16 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]:
                 if quantization_config.quant_method.value in ["teq", "awq"]
                 else False
             )
-            model = cls.ORIG_MODEL.from_pretrained(
-                pretrained_model_name_or_path,
-                *model_args,
-                config=config,
-                **kwargs,
-            )
-            model.config.update({"low_cpu_mem_usage": True})
+            if quantization_config.layer_wise:
+                model = load_empty_model(pretrained_model_name_or_path, torchscript=True)
+            else:
+                model = cls.ORIG_MODEL.from_pretrained(
+                    pretrained_model_name_or_path,
+                    *model_args,
+                    config=config,
+                    **kwargs,
+                )
+                model.config.update({"low_cpu_mem_usage": True})
             model.eval()

             if use_xpu:
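
Usage sketch (not part of the patch): a minimal example of the layer-wise path this change enables. It assumes the weight-only quantization config class (RtnConfig here) exposes the same layer_wise flag the patch reads as quantization_config.layer_wise / config.layer_wise; the checkpoint name and bit width are illustrative only.

    # Minimal sketch, assuming RtnConfig accepts a layer_wise flag that is
    # wired through to the checks added in this patch (a hypothetical setup,
    # not confirmed by the diff itself).
    from intel_extension_for_transformers.transformers import AutoModelForCausalLM, RtnConfig

    # With layer_wise=True, from_pretrained() builds an empty-weight model via
    # load_empty_model() instead of materializing full weights up front; after
    # quantization.fit(), the result is saved to ./tmp and reloaded with
    # load(..., weight_only=True, layer_wise=True).
    quantization_config = RtnConfig(bits=4, layer_wise=True)
    model = AutoModelForCausalLM.from_pretrained(
        "facebook/opt-125m",  # example checkpoint
        quantization_config=quantization_config,
    )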