From dc19bc1652fd930068d5151db239c719795ab07a Mon Sep 17 00:00:00 2001
From: George
Date: Tue, 1 Oct 2024 21:35:14 +0000
Subject: [PATCH 1/9] populate quantization_config for kv-cache-scheme only configs

---
 src/transformers/utils/quantization_config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py
index 8be0bb672e51b8..0223144ac9d7b8 100755
--- a/src/transformers/utils/quantization_config.py
+++ b/src/transformers/utils/quantization_config.py
@@ -1105,7 +1105,7 @@ def __init__(
         self.sparsity_config = None
 
         # parse from dict to load nested QuantizationScheme objects
-        if config_groups:
+        if config_groups or kv_cache_scheme:
             self.quantization_config = QuantizationConfig.parse_obj(
                 {
                     "config_groups": config_groups,

From b8a2d49a0a293d3d3a194bf740bdbfb096fa5ad7 Mon Sep 17 00:00:00 2001
From: George
Date: Wed, 30 Oct 2024 19:37:44 +0000
Subject: [PATCH 2/9] make compressed-tensors quantized models trainable

---
 .../quantizers/quantizer_compressed_tensors.py | 8 ++++----
 src/transformers/trainer.py                    | 5 ++++-
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/transformers/quantizers/quantizer_compressed_tensors.py b/src/transformers/quantizers/quantizer_compressed_tensors.py
index 347be5c6654fe7..5c092a67e6e1f7 100644
--- a/src/transformers/quantizers/quantizer_compressed_tensors.py
+++ b/src/transformers/quantizers/quantizer_compressed_tensors.py
@@ -66,11 +66,11 @@ def _process_model_before_weight_loading(self, model, **kwargs):
         apply_quantization_config(model, ct_quantization_config, run_compressed=True)
 
     def _process_model_after_weight_loading(self, model, **kwargs):
-        pass
+        model.is_compressed_tensors_quantized = True
 
     @property
-    def is_trainable(self):
-        return False
+    def is_trainable(self) -> bool:
+        return True
 
     def is_serializable(self, safe_serialization=None):
-        return False
+        return True

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 8367e2e413c13e..c5f4746a0c4fde 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -490,6 +490,9 @@ def __init__(
             getattr(model, "hf_quantizer", None) is not None and model.hf_quantizer.is_trainable
         )
 
+        _is_model_quantized_and_trainable = getattr(model, "is_quantized", False) and getattr(
+            model, "is_compressed_tensors_quantized", False
+        )
         # Filter out quantized + compiled models
         if _is_quantized_and_base_model and hasattr(model, "_orig_mod"):
             raise ValueError(
@@ -497,7 +500,7 @@ def __init__(
             )
 
         # At this stage the model is already loaded
-        if _is_quantized_and_base_model and not _is_peft_model(model):
+        if _is_quantized_and_base_model and not _is_peft_model(model) and not _is_model_quantized_and_trainable:
             raise ValueError(
                 "You cannot perform fine-tuning on purely quantized models. Please attach trainable adapters on top of"
                 " the quantized model to correctly perform fine-tuning. Please see: https://huggingface.co/docs/transformers/peft"

From a8a757a8dbdffa459cece8a8a6d40acbec619120 Mon Sep 17 00:00:00 2001
From: George
Date: Wed, 30 Oct 2024 21:43:44 +0000
Subject: [PATCH 3/9] populate versions on quant config

---
 src/transformers/quantizers/quantizer_compressed_tensors.py | 3 ++-
 src/transformers/utils/quantization_config.py               | 6 ++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/transformers/quantizers/quantizer_compressed_tensors.py b/src/transformers/quantizers/quantizer_compressed_tensors.py
index 5c092a67e6e1f7..08cd5ae5c7096d 100644
--- a/src/transformers/quantizers/quantizer_compressed_tensors.py
+++ b/src/transformers/quantizers/quantizer_compressed_tensors.py
@@ -38,6 +38,7 @@ def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs):
         from compressed_tensors.compressors import ModelCompressor
 
         self.compressor = ModelCompressor.from_compression_config(quantization_config)
+        breakpoint()
 
     def validate_environment(self, *args, **kwargs):
         if not is_compressed_tensors_available():
@@ -72,5 +73,5 @@ def _process_model_after_weight_loading(self, model, **kwargs):
     @property
     def is_trainable(self) -> bool:
         return True
 
-    def is_serializable(self, safe_serialization=None):
+    def is_serializable(self, safe_serialization=None) -> bool:
         return True

diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py
index 026a2066798574..98e051119d5028 100755
--- a/src/transformers/utils/quantization_config.py
+++ b/src/transformers/utils/quantization_config.py
@@ -1102,6 +1102,7 @@ def __init__(
         ignore: Optional[List[str]] = None,
         sparsity_config: Dict[str, Any] = None,
         quant_method: str = "compressed-tensors",
+        version: Optional[str] = None,
         **kwargs,
     ):
         from compressed_tensors import QuantizationConfig
@@ -1121,6 +1122,7 @@ def __init__(
                 "kv_cache_scheme": kv_cache_scheme,
                 "global_compression_ratio": global_compression_ratio,
                 "ignore": ignore,
+                "version": version,
                 **kwargs,
             }
         )
@@ -1150,11 +1152,15 @@ def from_dict(cls, config_dict, return_unused_kwargs=False, **kwargs):
 
         Returns:
             [`QuantizationConfigMixin`]: The configuration object instantiated from those parameters.
         """
+        ver = config_dict.get("version", None)
+
         if "quantization_config" in config_dict:
             config_dict = dict(
                 sparsity_config=config_dict.get("sparsity_config"),
                 **config_dict["quantization_config"],
             )
+            if ver is not None:
+                config_dict["version"] = ver
 
         return super().from_dict(config_dict, return_unused_kwargs=return_unused_kwargs, **kwargs)

From 95a36a37d2e804760834799fdaf8525ac19a98dc Mon Sep 17 00:00:00 2001
From: George
Date: Mon, 4 Nov 2024 14:40:55 +0000
Subject: [PATCH 4/9] pass oneshot then finetune

---
 .../quantizers/quantizer_compressed_tensors.py |  2 ++
 src/transformers/utils/quantization_config.py  | 18 ++++++++++++------
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/transformers/quantizers/quantizer_compressed_tensors.py b/src/transformers/quantizers/quantizer_compressed_tensors.py
index 08cd5ae5c7096d..9aef45ee62c630 100644
--- a/src/transformers/quantizers/quantizer_compressed_tensors.py
+++ b/src/transformers/quantizers/quantizer_compressed_tensors.py
@@ -71,7 +71,9 @@ def _process_model_after_weight_loading(self, model, **kwargs):
     @property
     def is_trainable(self) -> bool:
+        """Models quantized using compressed tensors can be finetuned"""
         return True
 
     def is_serializable(self, safe_serialization=None) -> bool:
+        """Models quantized using compressed tensors can be saved to disk"""
         return True

diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py
index 98e051119d5028..01f745533c5ccf 100755
--- a/src/transformers/utils/quantization_config.py
+++ b/src/transformers/utils/quantization_config.py
@@ -1092,6 +1092,8 @@ class CompressedTensorsConfig(QuantizationConfigMixin):
             do not override, should be compressed-tensors
     """
 
+    QUANTIZATION_NAME = "compressed-tensors"
+
     def __init__(
         self,
         config_groups: Dict[str, Union["QuantizationScheme", List[str]]] = None,  # noqa: F821
@@ -1166,16 +1168,20 @@ def from_dict(cls, config_dict, return_unused_kwargs=False, **kwargs):
 
     def to_dict(self) -> Dict[str, Any]:
         """
+        Quantization config to be added to config.json
+
         Serializes this instance to a Python dictionary. Returns:
             `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
         """
-        quantization_config = self.quantization_config.dict() if self.quantization_config is not None else None
-        sparsity_config = self.sparsity_config.dict() if self.sparsity_config is not None else None
+        quantization_config = {}
+        if self.quantization_config is not None:
+            quantization_config = self.quantization_config.dict()
+        else:
+            quantization_config["quant_method"] = self.QUANTIZATION_NAME
 
-        return {
-            "quantization_config": quantization_config,
-            "sparsity_config": sparsity_config,
-        }
+        quantization_config["sparsity_config"] = self.sparsity_config.dict()
+
+        return quantization_config
 
     def to_diff_dict(self) -> Dict[str, Any]:
         """

From 4cf4fe2d04642de8a14d67351ee11771b3c6efd0 Mon Sep 17 00:00:00 2001
From: George
Date: Mon, 4 Nov 2024 14:42:41 +0000
Subject: [PATCH 5/9] remove breakpoint

---
 src/transformers/quantizers/quantizer_compressed_tensors.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/transformers/quantizers/quantizer_compressed_tensors.py b/src/transformers/quantizers/quantizer_compressed_tensors.py
index 9aef45ee62c630..e54f946701e80d 100644
--- a/src/transformers/quantizers/quantizer_compressed_tensors.py
+++ b/src/transformers/quantizers/quantizer_compressed_tensors.py
@@ -38,7 +38,6 @@ def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs):
         from compressed_tensors.compressors import ModelCompressor
 
         self.compressor = ModelCompressor.from_compression_config(quantization_config)
-        breakpoint()
 
     def validate_environment(self, *args, **kwargs):
         if not is_compressed_tensors_available():

From dae8b3ce6ad1ff675e836b1e042a34addc01f30c Mon Sep 17 00:00:00 2001
From: George
Date: Mon, 4 Nov 2024 18:37:51 +0000
Subject: [PATCH 6/9] SunMarc comments and fix to_dict logic

---
 .../quantizers/quantizer_compressed_tensors.py | 7 ++++++-
 src/transformers/trainer.py                    | 5 +++--
 src/transformers/utils/quantization_config.py  | 8 ++++----
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/transformers/quantizers/quantizer_compressed_tensors.py b/src/transformers/quantizers/quantizer_compressed_tensors.py
index e54f946701e80d..292cf4acc86c5d 100644
--- a/src/transformers/quantizers/quantizer_compressed_tensors.py
+++ b/src/transformers/quantizers/quantizer_compressed_tensors.py
@@ -66,13 +66,18 @@ def _process_model_before_weight_loading(self, model, **kwargs):
         apply_quantization_config(model, ct_quantization_config, run_compressed=True)
 
     def _process_model_after_weight_loading(self, model, **kwargs):
-        model.is_compressed_tensors_quantized = True
+        pass
 
     @property
     def is_trainable(self) -> bool:
         """Models quantized using compressed tensors can be finetuned"""
         return True
 
+    @property
+    def is_qat_trainable(self) -> bool:
+        """Loaded Models can carry out quantization aware training"""
+        return True
+
     def is_serializable(self, safe_serialization=None) -> bool:
         """Models quantized using compressed tensors can be saved to disk"""
         return True

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 2fd360fb4062de..f38f5f0bdc4af6 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -522,9 +522,10 @@ def __init__(
             getattr(model, "hf_quantizer", None) is not None and model.hf_quantizer.is_trainable
         )
 
-        _is_model_quantized_and_trainable = getattr(model, "is_quantized", False) and getattr(
-            model, "is_compressed_tensors_quantized", False
+        _is_model_quantized_and_trainable = getattr(model, "hf_quantizer", None) is not None and getattr(
+            model, "hf_quantizer", False
         )
+
         # Filter out quantized + compiled models
         if _is_quantized_and_base_model and hasattr(model, "_orig_mod"):
             raise ValueError(

diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py
index 01f745533c5ccf..a3a8667c1a01ce 100755
--- a/src/transformers/utils/quantization_config.py
+++ b/src/transformers/utils/quantization_config.py
@@ -1154,15 +1154,12 @@ def from_dict(cls, config_dict, return_unused_kwargs=False, **kwargs):
 
         Returns:
             [`QuantizationConfigMixin`]: The configuration object instantiated from those parameters.
         """
-        ver = config_dict.get("version", None)
 
         if "quantization_config" in config_dict:
             config_dict = dict(
                 sparsity_config=config_dict.get("sparsity_config"),
                 **config_dict["quantization_config"],
             )
-            if ver is not None:
-                config_dict["version"] = ver
 
         return super().from_dict(config_dict, return_unused_kwargs=return_unused_kwargs, **kwargs)
@@ -1179,7 +1176,10 @@ def to_dict(self) -> Dict[str, Any]:
         else:
             quantization_config["quant_method"] = self.QUANTIZATION_NAME
 
-        quantization_config["sparsity_config"] = self.sparsity_config.dict()
+        if self.sparsity_config is not None:
+            quantization_config["sparsity_config"] = self.sparsity_config.dict()
+        else:
+            quantization_config["sparsity_config"] = {}
 
         return quantization_config

From 4387053115956733cf5b09c71d455b3bab7a564d Mon Sep 17 00:00:00 2001
From: George
Date: Thu, 7 Nov 2024 15:03:21 +0000
Subject: [PATCH 7/9] lint

---
 src/transformers/utils/quantization_config.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py
index a3a8667c1a01ce..2d394bb60914eb 100755
--- a/src/transformers/utils/quantization_config.py
+++ b/src/transformers/utils/quantization_config.py
@@ -1090,6 +1090,7 @@ class CompressedTensorsConfig(QuantizationConfigMixin):
             configuration for sparsity compression
         quant_method (`str`, *optional*, defaults to `"compressed-tensors"`):
             do not override, should be compressed-tensors
+        version (`Optional`, *optional*):
     """
 
     QUANTIZATION_NAME = "compressed-tensors"

From e4db20711062dcaea2b684d59311ff71597c1edc Mon Sep 17 00:00:00 2001
From: George
Date: Wed, 13 Nov 2024 17:20:53 +0000
Subject: [PATCH 8/9] lint

---
 src/transformers/quantizers/quantizer_compressed_tensors.py | 2 +-
 src/transformers/utils/quantization_config.py               | 3 ---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/transformers/quantizers/quantizer_compressed_tensors.py b/src/transformers/quantizers/quantizer_compressed_tensors.py
index 292cf4acc86c5d..61e940886d942f 100644
--- a/src/transformers/quantizers/quantizer_compressed_tensors.py
+++ b/src/transformers/quantizers/quantizer_compressed_tensors.py
@@ -65,7 +65,7 @@ def _process_model_before_weight_loading(self, model, **kwargs):
         ct_quantization_config = self.compressor.quantization_config
         apply_quantization_config(model, ct_quantization_config, run_compressed=True)
 
-    def _process_model_after_weight_loading(self, model, **kwargs):
+    def _process_model_after_weight_loading(self, model, **kwargs) -> None:
         pass
 
     @property

diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py
index fe03504512789c..9bcd40f15df4cd 100755
--- a/src/transformers/utils/quantization_config.py
+++ b/src/transformers/utils/quantization_config.py
@@ -1090,7 +1090,6 @@ class CompressedTensorsConfig(QuantizationConfigMixin):
             configuration for sparsity compression
         quant_method (`str`, *optional*, defaults to `"compressed-tensors"`):
             do not override, should be compressed-tensors
-        version (`Optional`, *optional*):
     """
 
     QUANTIZATION_NAME = "compressed-tensors"
@@ -1105,7 +1104,6 @@ def __init__(
         ignore: Optional[List[str]] = None,
         sparsity_config: Dict[str, Any] = None,
         quant_method: str = "compressed-tensors",
-        version: Optional[str] = None,
         **kwargs,
     ):
         from compressed_tensors import QuantizationConfig
@@ -1125,7 +1123,6 @@ def __init__(
                 "kv_cache_scheme": kv_cache_scheme,
                 "global_compression_ratio": global_compression_ratio,
                 "ignore": ignore,
-                "version": version,
                 **kwargs,
             }
         )

From dca9c55546dde27b8c711863fb4c41e37ce53f06 Mon Sep 17 00:00:00 2001
From: George
Date: Wed, 13 Nov 2024 18:05:02 +0000
Subject: [PATCH 9/9] test

---
 src/transformers/trainer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 1be23698066d3a..e3cebabecd0b97 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -532,7 +532,8 @@ def __init__(
             )
 
         # At this stage the model is already loaded
-        if _is_quantized_and_base_model and not _is_peft_model(model) and not _is_model_quantized_and_trainable:
+        # if _is_quantized_and_base_model and not _is_peft_model(model) and not _is_model_quantized_and_trainable:
+        if _is_quantized_and_base_model and not _is_peft_model(model):
            raise ValueError(
                "You cannot perform fine-tuning on purely quantized models. Please attach trainable adapters on top of"
                " the quantized model to correctly perform fine-tuning. Please see: https://huggingface.co/docs/transformers/peft"
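
The sketches below are illustrative only and are not part of the patches; any name or value in them that does not appear in the diffs above is a placeholder.

PATCH 1 lets a config that carries only a kv_cache_scheme, with no config_groups, still populate self.quantization_config. A minimal sketch, assuming the compressed-tensors package is installed and accepts a scheme dict shaped like this one:

    from transformers.utils.quantization_config import CompressedTensorsConfig

    # Placeholder kv-cache scheme: 8-bit float, per-tensor. The field names are
    # assumed to match compressed_tensors' QuantizationArgs model.
    cfg = CompressedTensorsConfig(
        kv_cache_scheme={"num_bits": 8, "type": "float", "strategy": "tensor"},
    )

    # Before PATCH 1, quantization_config stayed None whenever config_groups was
    # empty; with `if config_groups or kv_cache_scheme:` this config is parsed too.
    assert cfg.quantization_config is not None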
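
PATCHES 4 and 6 flatten CompressedTensorsConfig.to_dict(): instead of nesting everything under separate "quantization_config" and "sparsity_config" top-level keys, it returns the quantization dict itself, falling back to QUANTIZATION_NAME when nothing was parsed and always including a "sparsity_config" entry. A rough illustration, under the same assumption that compressed-tensors is installed:

    from transformers.utils.quantization_config import CompressedTensorsConfig

    cfg = CompressedTensorsConfig()  # no config_groups, no kv_cache_scheme
    d = cfg.to_dict()

    # With no parsed quantization_config, quant_method falls back to
    # QUANTIZATION_NAME, and sparsity_config defaults to an empty dict.
    assert d["quant_method"] == "compressed-tensors"
    assert d["sparsity_config"] == {}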
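
End to end, the series aims to let a compressed-tensors checkpoint go straight into Trainer without a PEFT adapter, since is_trainable (and, after PATCH 6, is_qat_trainable) now return True. Note that PATCH 9 temporarily restores the original guard as a test, so this sketch reflects the intended end state rather than the tree as of PATCH 9. The checkpoint path is hypothetical and the one-row dataset exists only to keep the example self-contained:

    from datasets import Dataset
    from transformers import AutoModelForCausalLM, Trainer, TrainingArguments

    # Hypothetical path to a model quantized with compressed-tensors.
    model = AutoModelForCausalLM.from_pretrained("path/to/compressed-tensors-checkpoint")

    # Tiny stand-in dataset; substitute real tokenized data here.
    train_dataset = Dataset.from_dict({"input_ids": [[1, 2, 3]], "labels": [[1, 2, 3]]})

    trainer = Trainer(
        model=model,  # previously rejected as a "purely quantized" model
        args=TrainingArguments(output_dir="finetuned", num_train_epochs=1),
        train_dataset=train_dataset,
    )
    trainer.train()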