From 0f4ecd94141822ad1ee4f536f1a85f37ed5026bf Mon Sep 17 00:00:00 2001
From: Mengni Wang
Date: Sun, 31 Mar 2024 14:39:28 +0800
Subject: [PATCH 1/2] add FP8Config

Signed-off-by: Mengni Wang
---
 .../transformers/__init__.py               |  1 +
 .../transformers/modeling/modeling_auto.py | 75 +++++++++++++++++-
 .../transformers/utils/__init__.py         |  1 +
 .../transformers/utils/config.py           | 76 +++++++++++++++++++
 4 files changed, 150 insertions(+), 3 deletions(-)

diff --git a/intel_extension_for_transformers/transformers/__init__.py b/intel_extension_for_transformers/transformers/__init__.py
index 9bb247f7dd0..400082f10d8 100644
--- a/intel_extension_for_transformers/transformers/__init__.py
+++ b/intel_extension_for_transformers/transformers/__init__.py
@@ -44,6 +44,7 @@
     MixedPrecisionConfig,
     BitsAndBytesConfig,
     SmoothQuantConfig,
+    FP8Config,
     RtnConfig,
     AwqConfig,
     TeqConfig,
diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
index 9b84e79ce75..75ef25c53ab 100644
--- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
+++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
@@ -42,6 +42,7 @@
     BitsAndBytesConfig,
     MixedPrecisionConfig,
     SmoothQuantConfig,
+    FP8Config,
     RtnConfig,
     AwqConfig,
     TeqConfig,
@@ -69,6 +70,8 @@
 )
 from ...tools.utils import get_gpu_family, is_ipex_available
 from neural_compressor.adaptor.torch_utils.model_wrapper import WeightOnlyLinear
+from neural_compressor.torch.quantization import quantize
+from neural_compressor.torch.quantization import FP8Config as INCFP8Config
 from transformers.configuration_utils import PretrainedConfig
 from transformers import AutoConfig
 from transformers.utils import is_accelerate_available, is_bitsandbytes_available
@@ -361,6 +364,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         use_xpu = (
             True if device_map == torch.device("xpu") or device_map == "xpu" else False
         )
+        use_hpu = (
+            True if device_map == torch.device("hpu") or device_map == "hpu" else False
+        )
         config = kwargs.pop("config", None)
         model_hub = kwargs.pop("model_hub", "huggingface")

@@ -380,12 +386,12 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):

         quantization_config = kwargs.pop("quantization_config", None)
         if kwargs.get("use_llm_runtime", None) is not None:
-            use_neural_speed = kwargs.pop("use_llm_runtime", True) and not use_xpu
+            use_neural_speed = kwargs.pop("use_llm_runtime", True) and not use_xpu and not use_hpu
             logger.warning(
                 "use_llm_runtime is deprecated in version 1.3.2, please use_neural_speed instead."
             )
         elif kwargs.get("use_neural_speed", None) is not None:
-            use_neural_speed = kwargs.pop("use_neural_speed", True) and not use_xpu
+            use_neural_speed = kwargs.pop("use_neural_speed", True) and not use_xpu and not use_hpu
         else:
             if hasattr(config, "model_type") == False:
                 logger.error(
@@ -393,7 +399,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                 )
                 exit(0)

-            if config.model_type in cls.model_type_list and not use_xpu:
+            if config.model_type in cls.model_type_list and not use_xpu and not use_hpu:
                 if (
                     isinstance(quantization_config, GPTQConfig)
                     and config.model_type not in cls.model_type_list_for_gptq
@@ -452,6 +458,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
             and is_bitsandbytes_available()
             and not use_cpu
             and not use_xpu
+            and not use_hpu
         ):
             model = cls.ORIG_MODEL.from_pretrained(
                 pretrained_model_name_or_path,
@@ -646,6 +653,68 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):

             model.save_pretrained = types.MethodType(save_low_bit, model)
             logger.info("WeightOnlyQuant done.")
+        elif isinstance(quantization_config, FP8Config) and use_hpu:
+            model = cls.ORIG_MODEL.from_pretrained(
+                pretrained_model_name_or_path,
+                *model_args,
+                config=config,
+            )
+            if quantization_config.approach == "dynamic":
+                from neural_compressor.torch.algorithms.habana_fp8 import quantize_dynamic
+                model = quantize_dynamic(model, quantization_config.precision, inplace=True)
+            elif quantization_config.approach == "static":
+                qconfig = INCFP8Config(w_dtype=quantization_config.precision, act_dtype=quantization_config.precision, approach="static")
+                if quantization_config.skip_lm_head:
+                    fp32_config = INCFP8Config(w_dtype="fp32", act_dtype="fp32")
+                    qconfig.set_local("lm_head", fp32_config)
+
+                # calibration function
+                calib_func = quantization_config.calib_func
+                tokenizer = quantization_config.tokenizer
+                if calib_func is None:
+                    if quantization_config.tokenizer is None:
+                        logger.error(
+                            "Please provide the tokenizer or provide calib_func directly,"
+                            + " the following is how to get the tokenizer. \n"
+                            + " from transformers import AutoTokenizer \n"
+                            + " tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) \n"
+                        )
+                        exit(0)
+
+                    calib_dataset = quantization_config.calib_dataset
+                    calib_shuffle = quantization_config.calib_shuffle
+                    calib_iters = quantization_config.calib_iters
+                    calib_padding = quantization_config.calib_padding
+                    calib_len = quantization_config.calib_len
+
+                    # dataset
+                    from datasets import load_dataset
+                    calib_dataset = load_dataset(calib_dataset, split="train").select(range(100))
+                    if calib_shuffle:
+                        calib_dataset = calib_dataset.shuffle(seed=42)
+                    calib_data = []
+                    for examples in calib_dataset:
+                        calib_data.append(
+                            tokenizer(
+                                examples["text"],
+                                return_tensors="pt",
+                                max_length=calib_len,
+                                padding="max_length",
+                                truncation=True
+                            )
+                        )
+
+                    def calib_func(model):
+                        for i, calib_input in enumerate(calib_data):
+                            if i >= calib_iters:
+                                break
+                            model(
+                                input_ids=calib_input["input_ids"].to('hpu'),
+                                attention_mask=calib_input["attention_mask"].to('hpu'),
+                            )
+                    calib_func = calib_func
+                model = quantize(model, qconfig, calib_func, inplace=True)
+            logger.info("FP8 Quantization done.")
         elif isinstance(quantization_config, SmoothQuantConfig):
             try:
                 import intel_extension_for_pytorch as ipex
diff --git a/intel_extension_for_transformers/transformers/utils/__init__.py b/intel_extension_for_transformers/transformers/utils/__init__.py
index 81e9f1d19bf..af9437032de 100644
--- a/intel_extension_for_transformers/transformers/utils/__init__.py
+++ b/intel_extension_for_transformers/transformers/utils/__init__.py
@@ -21,6 +21,7 @@
     MixedPrecisionConfig,
     BitsAndBytesConfig,
     SmoothQuantConfig,
+    FP8Config,
     SparsityConfig,
     RtnConfig,
     AwqConfig,
diff --git a/intel_extension_for_transformers/transformers/utils/config.py b/intel_extension_for_transformers/transformers/utils/config.py
index 3e083ba37dc..93db86b04eb 100644
--- a/intel_extension_for_transformers/transformers/utils/config.py
+++ b/intel_extension_for_transformers/transformers/utils/config.py
@@ -628,6 +628,82 @@ def get_config_dict(
         )


+class FP8Config(ITREXQuantizationConfigMixin):
+    """
+    This is a wrapper class holding all the arguments needed for FP8 quantization of a model on Intel Gaudi (HPU)
+    devices, relying on the Intel Neural Compressor FP8 backend.
+
+    Args:
+        precision (`str`, *optional*, defaults to `"fp8_e4m3"`):
+            The data type of weights and activations, either `"fp8_e4m3"` or `"fp8_e5m2"`.
+        approach (`str`, *optional*, defaults to `"static"`):
+            The quantization approach, either `"static"` (with calibration) or `"dynamic"`.
+    """
+
+    def __init__(
+        self,
+        precision: str = "fp8_e4m3",
+        approach: str = "static",
+        **kwargs,
+    ):
+        self.precision = precision
+        self.approach = approach
+        self.device = kwargs.get("device", "hpu")
+        self.calib_dataloader = kwargs.get("calib_dataloader", None)
+        self.calib_dataset = kwargs.get("calib_dataset", "NeelNanda/pile-10k")
+        self.calib_func = kwargs.get("calib_func", None)
+        self.calib_padding = kwargs.get("calib_padding", False)
+        self.calib_len = kwargs.get("calib_len", 64)
+        self.calib_shuffle = kwargs.get("calib_shuffle", True)
+        self.calib_iters = kwargs.get("calib_iters", 100)
+        self.skip_lm_head = kwargs.get("skip_lm_head", False)
+        self.tokenizer = kwargs.get("tokenizer", None)
+        self.post_init_fp8()
+
+    def post_init_fp8(self):
+        r"""
+        Safety checker that arguments are correct - also replaces some NoneType arguments with their default values.
+        """
+        if self.precision is not None and self.precision not in ["fp8_e5m2", "fp8_e4m3"]:
+            raise ValueError("precision must be in ['fp8_e5m2', 'fp8_e4m3'].")
+        elif self.precision is None:
+            self.precision = "fp8_e4m3"
+
+        if self.approach is None:
+            self.approach = "static"
+        elif self.approach not in ["static", "dynamic"]:
+            raise ValueError(
+                f"Only support 'static' and 'dynamic' approach but found {self.approach}"
+            )
+
+        if self.device is not None and self.device not in ["hpu", torch.device("hpu")]:
+            raise ValueError(f"Only support hpu device but found {self.device}")
+        elif self.device is None:
+            self.device = "hpu"
+
+    def to_diff_dict(self) -> Dict[str, Any]:
+        """
+        Removes all attributes from config which correspond to the default config attributes for better readability and
+        serializes to a Python dictionary.
+
+        Returns:
+            `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
+        """
+        config_dict = self.to_dict()
+
+        # get the default config dict
+        default_config_dict = FP8Config().to_dict()
+
+        serializable_config_dict = {}
+
+        # only serialize values that differ from the default config
+        for key, value in config_dict.items():
+            if value != default_config_dict[key]:
+                serializable_config_dict[key] = value
+
+        return serializable_config_dict
+
+
 class RtnConfig(ITREXQuantizationConfigMixin):
     def __init__(
         self,

From 3a3568c9112c7c3166d39eec6e15748019cbaa80 Mon Sep 17 00:00:00 2001
From: "Wang, Mengni"
Date: Mon, 1 Apr 2024 10:57:02 +0800
Subject: [PATCH 2/2] Update modeling_auto.py

Signed-off-by: Wang, Mengni
---
 .../transformers/modeling/modeling_auto.py | 144 +++++++++---------
 1 file changed, 72 insertions(+), 72 deletions(-)

diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
index 75ef25c53ab..c1a9f326333 100644
--- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
+++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
@@ -42,7 +42,7 @@
     BitsAndBytesConfig,
     MixedPrecisionConfig,
     SmoothQuantConfig,
-    FP8Config,
+    FP8Config,
     RtnConfig,
     AwqConfig,
     TeqConfig,
@@ -70,8 +70,8 @@
 )
 from ...tools.utils import get_gpu_family, is_ipex_available
 from neural_compressor.adaptor.torch_utils.model_wrapper import WeightOnlyLinear
-from neural_compressor.torch.quantization import quantize
-from neural_compressor.torch.quantization import FP8Config as INCFP8Config
+from neural_compressor.torch.quantization import quantize
+from neural_compressor.torch.quantization import FP8Config as INCFP8Config
 from transformers.configuration_utils import PretrainedConfig
 from transformers import AutoConfig
 from transformers.utils import is_accelerate_available, is_bitsandbytes_available
@@ -364,9 +364,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         use_xpu = (
             True if device_map == torch.device("xpu") or device_map == "xpu" else False
         )
-        use_hpu = (
-            True if device_map == torch.device("hpu") or device_map == "hpu" else False
-        )
+        use_hpu = (
+            True if device_map == torch.device("hpu") or device_map == "hpu" else False
+        )
         config = kwargs.pop("config", None)
         model_hub = kwargs.pop("model_hub", "huggingface")

@@ -386,12 +386,12 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):

         quantization_config = kwargs.pop("quantization_config", None)
         if kwargs.get("use_llm_runtime", None) is not None:
-            use_neural_speed = kwargs.pop("use_llm_runtime", True) and not use_xpu and not use_hpu
+            use_neural_speed = kwargs.pop("use_llm_runtime", True) and not use_xpu and not use_hpu
             logger.warning(
                 "use_llm_runtime is deprecated in version 1.3.2, please use_neural_speed instead."
             )
         elif kwargs.get("use_neural_speed", None) is not None:
-            use_neural_speed = kwargs.pop("use_neural_speed", True) and not use_xpu and not use_hpu
+            use_neural_speed = kwargs.pop("use_neural_speed", True) and not use_xpu and not use_hpu
         else:
             if hasattr(config, "model_type") == False:
                 logger.error(
@@ -399,7 +399,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                 )
                 exit(0)

-            if config.model_type in cls.model_type_list and not use_xpu and not use_hpu:
+            if config.model_type in cls.model_type_list and not use_xpu and not use_hpu:
                 if (
                     isinstance(quantization_config, GPTQConfig)
                     and config.model_type not in cls.model_type_list_for_gptq
@@ -458,7 +458,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
             and is_bitsandbytes_available()
             and not use_cpu
             and not use_xpu
-            and not use_hpu
+            and not use_hpu
         ):
             model = cls.ORIG_MODEL.from_pretrained(
                 pretrained_model_name_or_path,
@@ -653,68 +653,68 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):

             model.save_pretrained = types.MethodType(save_low_bit, model)
             logger.info("WeightOnlyQuant done.")
-        elif isinstance(quantization_config, FP8Config) and use_hpu:
-            model = cls.ORIG_MODEL.from_pretrained(
-                pretrained_model_name_or_path,
-                *model_args,
-                config=config,
-            )
-            if quantization_config.approach == "dynamic":
-                from neural_compressor.torch.algorithms.habana_fp8 import quantize_dynamic
-                model = quantize_dynamic(model, quantization_config.precision, inplace=True)
-            elif quantization_config.approach == "static":
-                qconfig = INCFP8Config(w_dtype=quantization_config.precision, act_dtype=quantization_config.precision, approach="static")
-                if quantization_config.skip_lm_head:
-                    fp32_config = INCFP8Config(w_dtype="fp32", act_dtype="fp32")
-                    qconfig.set_local("lm_head", fp32_config)
-
-                # calibration function
-                calib_func = quantization_config.calib_func
-                tokenizer = quantization_config.tokenizer
-                if calib_func is None:
-                    if quantization_config.tokenizer is None:
-                        logger.error(
-                            "Please provide the tokenizer or provide calib_func directly,"
-                            + " the following is how to get the tokenizer. \n"
-                            + " from transformers import AutoTokenizer \n"
-                            + " tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) \n"
-                        )
-                        exit(0)
-
-                    calib_dataset = quantization_config.calib_dataset
-                    calib_shuffle = quantization_config.calib_shuffle
-                    calib_iters = quantization_config.calib_iters
-                    calib_padding = quantization_config.calib_padding
-                    calib_len = quantization_config.calib_len
-
-                    # dataset
-                    from datasets import load_dataset
-                    calib_dataset = load_dataset(calib_dataset, split="train").select(range(100))
-                    if calib_shuffle:
-                        calib_dataset = calib_dataset.shuffle(seed=42)
-                    calib_data = []
-                    for examples in calib_dataset:
-                        calib_data.append(
-                            tokenizer(
-                                examples["text"],
-                                return_tensors="pt",
-                                max_length=calib_len,
-                                padding="max_length",
-                                truncation=True
-                            )
-                        )
-
-                    def calib_func(model):
-                        for i, calib_input in enumerate(calib_data):
-                            if i >= calib_iters:
-                                break
-                            model(
-                                input_ids=calib_input["input_ids"].to('hpu'),
-                                attention_mask=calib_input["attention_mask"].to('hpu'),
-                            )
-                    calib_func = calib_func
-                model = quantize(model, qconfig, calib_func, inplace=True)
-            logger.info("FP8 Quantization done.")
+        elif isinstance(quantization_config, FP8Config) and use_hpu:
+            model = cls.ORIG_MODEL.from_pretrained(
+                pretrained_model_name_or_path,
+                *model_args,
+                config=config,
+            )
+            if quantization_config.approach == "dynamic":
+                from neural_compressor.torch.algorithms.habana_fp8 import quantize_dynamic
+                model = quantize_dynamic(model, quantization_config.precision, inplace=True)
+            elif quantization_config.approach == "static":
+                qconfig = INCFP8Config(w_dtype=quantization_config.precision, act_dtype=quantization_config.precision, approach="static")
+                if quantization_config.skip_lm_head:
+                    fp32_config = INCFP8Config(w_dtype="fp32", act_dtype="fp32")
+                    qconfig.set_local("lm_head", fp32_config)
+
+                # calibration function
+                calib_func = quantization_config.calib_func
+                tokenizer = quantization_config.tokenizer
+                if calib_func is None:
+                    if quantization_config.tokenizer is None:
+                        logger.error(
+                            "Please provide the tokenizer or provide calib_func directly,"
+                            + " the following is how to get the tokenizer. \n"
+                            + " from transformers import AutoTokenizer \n"
+                            + " tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) \n"
+                        )
+                        exit(0)
+
+                    calib_dataset = quantization_config.calib_dataset
+                    calib_shuffle = quantization_config.calib_shuffle
+                    calib_iters = quantization_config.calib_iters
+                    calib_padding = quantization_config.calib_padding
+                    calib_len = quantization_config.calib_len
+
+                    # dataset
+                    from datasets import load_dataset
+                    calib_dataset = load_dataset(calib_dataset, split="train").select(range(100))
+                    if calib_shuffle:
+                        calib_dataset = calib_dataset.shuffle(seed=42)
+                    calib_data = []
+                    for examples in calib_dataset:
+                        calib_data.append(
+                            tokenizer(
+                                examples["text"],
+                                return_tensors="pt",
+                                max_length=calib_len,
+                                padding="max_length",
+                                truncation=True
+                            )
+                        )
+
+                    def calib_func(model):
+                        for i, calib_input in enumerate(calib_data):
+                            if i >= calib_iters:
+                                break
+                            model(
+                                input_ids=calib_input["input_ids"].to('hpu'),
+                                attention_mask=calib_input["attention_mask"].to('hpu'),
+                            )
+                    calib_func = calib_func
+                model = quantize(model, qconfig, calib_func, inplace=True)
+            logger.info("FP8 Quantization done.")
         elif isinstance(quantization_config, SmoothQuantConfig):
             try:
                 import intel_extension_for_pytorch as ipex