From 4b0418df11886547e2c701cc4504627881397a0b Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Fri, 13 Sep 2024 12:58:38 +0200 Subject: [PATCH 01/67] Enable `padding_side` as call time kwargs (#33385) * fix * add padding-side kwarg * add padding side in all models & fix tests * fix copies * fix tests --- .../layoutlmv2/tokenization_layoutlmv2.py | 29 +++++++++-- .../tokenization_layoutlmv2_fast.py | 22 +++++++-- .../layoutlmv3/tokenization_layoutlmv3.py | 29 +++++++++-- .../tokenization_layoutlmv3_fast.py | 22 +++++++-- .../layoutxlm/tokenization_layoutxlm.py | 23 +++++++-- .../layoutxlm/tokenization_layoutxlm_fast.py | 18 +++++-- .../models/led/tokenization_led.py | 2 + .../models/led/tokenization_led_fast.py | 2 + .../models/luke/tokenization_luke.py | 29 +++++++++-- .../models/markuplm/tokenization_markuplm.py | 29 +++++++++-- .../markuplm/tokenization_markuplm_fast.py | 22 +++++++-- .../models/mluke/tokenization_mluke.py | 29 +++++++++-- .../models/roc_bert/tokenization_roc_bert.py | 17 +++++-- .../models/tapas/tokenization_tapas.py | 27 ++++++++-- .../models/udop/tokenization_udop.py | 27 ++++++++-- .../models/udop/tokenization_udop_fast.py | 22 +++++++-- .../models/wav2vec2/tokenization_wav2vec2.py | 6 +++ src/transformers/tokenization_utils.py | 7 +++ src/transformers/tokenization_utils_base.py | 37 ++++++++++++-- src/transformers/tokenization_utils_fast.py | 10 +++- .../test_tokenization_layoutlmv2.py | 44 ++++++++++------- .../test_tokenization_layoutlmv3.py | 44 ++++++++++------- .../layoutxlm/test_tokenization_layoutxlm.py | 44 ++++++++++------- .../markuplm/test_tokenization_markuplm.py | 44 ++++++++++------- tests/models/tapas/test_tokenization_tapas.py | 43 +++++++++------- tests/test_tokenization_common.py | 49 ++++++++++++------- 26 files changed, 528 insertions(+), 149 deletions(-) diff --git a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py index 
fe0305562374d7..c5ec79666deede 100644 --- a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py @@ -414,6 +414,7 @@ def __call__( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -517,6 +518,7 @@ def _is_valid_text_input(t): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -539,6 +541,7 @@ def _is_valid_text_input(t): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -567,6 +570,7 @@ def batch_encode_plus( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -598,6 +602,7 @@ def batch_encode_plus( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -625,6 +630,7 @@ def _batch_encode_plus( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -653,6 +659,7 @@ def 
_batch_encode_plus( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -677,6 +684,7 @@ def _batch_prepare_for_model( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -708,6 +716,7 @@ def _batch_prepare_for_model( max_length=max_length, stride=stride, pad_to_multiple_of=None, # we pad in batch afterward + padding_side=None, # we pad in batch afterward return_attention_mask=False, # we pad in batch afterward return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -728,6 +737,7 @@ def _batch_prepare_for_model( padding=padding_strategy.value, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -748,6 +758,7 @@ def encode( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -769,6 +780,7 @@ def encode( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -795,6 +807,7 @@ def encode_plus( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, 
return_attention_mask: Optional[bool] = None, @@ -838,6 +851,7 @@ def encode_plus( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -861,6 +875,7 @@ def _encode_plus( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -891,6 +906,7 @@ def _encode_plus( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, prepend_batch_axis=True, return_attention_mask=return_attention_mask, @@ -914,6 +930,7 @@ def prepare_for_model( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -1100,6 +1117,7 @@ def prepare_for_model( max_length=max_length, padding=padding_strategy.value, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -1243,6 +1261,7 @@ def _pad( max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: """ @@ -1265,6 +1284,9 @@ def _pad( pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta). 
+ padding_side: + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ @@ -1288,7 +1310,8 @@ def _pad( if needs_to_be_padded: difference = max_length - len(required_input) - if self.padding_side == "right": + padding_side = padding_side if padding_side is not None else self.padding_side + if padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: @@ -1302,7 +1325,7 @@ def _pad( if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference - elif self.padding_side == "left": + elif padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: @@ -1317,7 +1340,7 @@ def _pad( encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input else: - raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + raise ValueError("Invalid padding strategy:" + str(padding_side)) return encoded_inputs diff --git a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py index aa2bf6b3226b18..a666e3d4ea1a43 100644 --- a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py +++ b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py @@ -165,6 +165,7 @@ def __call__( max_length: Optional[int] = None, 
stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -268,6 +269,7 @@ def _is_valid_text_input(t): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -290,6 +292,7 @@ def _is_valid_text_input(t): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -318,6 +321,7 @@ def batch_encode_plus( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -349,6 +353,7 @@ def batch_encode_plus( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -381,6 +386,7 @@ def encode_plus( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -424,6 +430,7 @@ def encode_plus( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -451,6 +458,7 @@ def 
_batch_encode_plus( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -470,6 +478,7 @@ def _batch_encode_plus( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, ) if is_pair: @@ -603,6 +612,7 @@ def _encode_plus( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[bool] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -631,6 +641,7 @@ def _encode_plus( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -663,6 +674,7 @@ def _pad( max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: """ @@ -685,6 +697,9 @@ def _pad( pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side: + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. 
return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ @@ -708,7 +723,8 @@ def _pad( if needs_to_be_padded: difference = max_length - len(required_input) - if self.padding_side == "right": + padding_side = padding_side if padding_side is not None else self.padding_side + if padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: @@ -722,7 +738,7 @@ def _pad( if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference - elif self.padding_side == "left": + elif padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: @@ -737,7 +753,7 @@ def _pad( encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input else: - raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + raise ValueError("Invalid padding strategy:" + str(padding_side)) return encoded_inputs diff --git a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py index 89f899f22f4ecc..248a299c141fd5 100644 --- a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py @@ -543,6 +543,7 @@ def __call__( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, 
return_attention_mask: Optional[bool] = None, @@ -646,6 +647,7 @@ def _is_valid_text_input(t): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -668,6 +670,7 @@ def _is_valid_text_input(t): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -697,6 +700,7 @@ def batch_encode_plus( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -728,6 +732,7 @@ def batch_encode_plus( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -756,6 +761,7 @@ def _batch_encode_plus( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -784,6 +790,7 @@ def _batch_encode_plus( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -809,6 +816,7 @@ def _batch_prepare_for_model( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, 
return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -840,6 +848,7 @@ def _batch_prepare_for_model( max_length=max_length, stride=stride, pad_to_multiple_of=None, # we pad in batch afterward + padding_side=None, # we pad in batch afterward return_attention_mask=False, # we pad in batch afterward return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -860,6 +869,7 @@ def _batch_prepare_for_model( padding=padding_strategy.value, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -881,6 +891,7 @@ def encode( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -902,6 +913,7 @@ def encode( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -929,6 +941,7 @@ def encode_plus( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -972,6 +985,7 @@ def encode_plus( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -996,6 +1010,7 @@ def _encode_plus( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: 
Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -1026,6 +1041,7 @@ def _encode_plus( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, prepend_batch_axis=True, return_attention_mask=return_attention_mask, @@ -1049,6 +1065,7 @@ def prepare_for_model( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -1237,6 +1254,7 @@ def prepare_for_model( max_length=max_length, padding=padding_strategy.value, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -1382,6 +1400,7 @@ def _pad( max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: """ @@ -1404,6 +1423,9 @@ def _pad( pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side: + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. 
return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ @@ -1427,7 +1449,8 @@ def _pad( if needs_to_be_padded: difference = max_length - len(required_input) - if self.padding_side == "right": + padding_side = padding_side if padding_side is not None else self.padding_side + if padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: @@ -1441,7 +1464,7 @@ def _pad( if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference - elif self.padding_side == "left": + elif padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: @@ -1456,6 +1479,6 @@ def _pad( encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input else: - raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + raise ValueError("Invalid padding strategy:" + str(padding_side)) return encoded_inputs diff --git a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py index 07bedf36133ad8..63cd1022e52170 100644 --- a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py +++ b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py @@ -217,6 +217,7 @@ def __call__( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = 
None, return_attention_mask: Optional[bool] = None, @@ -320,6 +321,7 @@ def _is_valid_text_input(t): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -342,6 +344,7 @@ def _is_valid_text_input(t): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -371,6 +374,7 @@ def batch_encode_plus( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -402,6 +406,7 @@ def batch_encode_plus( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -436,6 +441,7 @@ def encode_plus( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -479,6 +485,7 @@ def encode_plus( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -506,6 +513,7 @@ def _batch_encode_plus( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[str] = None, 
return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -525,6 +533,7 @@ def _batch_encode_plus( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, ) if is_pair: @@ -664,6 +673,7 @@ def _encode_plus( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[bool] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -692,6 +702,7 @@ def _encode_plus( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -725,6 +736,7 @@ def _pad( max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: """ @@ -747,6 +759,9 @@ def _pad( pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side: + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. 
return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ @@ -770,7 +785,8 @@ def _pad( if needs_to_be_padded: difference = max_length - len(required_input) - if self.padding_side == "right": + padding_side = padding_side if padding_side is not None else self.padding_side + if padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: @@ -784,7 +800,7 @@ def _pad( if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference - elif self.padding_side == "left": + elif padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: @@ -799,7 +815,7 @@ def _pad( encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input else: - raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + raise ValueError("Invalid padding strategy:" + str(padding_side)) return encoded_inputs diff --git a/src/transformers/models/layoutxlm/tokenization_layoutxlm.py b/src/transformers/models/layoutxlm/tokenization_layoutxlm.py index 3ab57ac892aa73..248f16af8441c1 100644 --- a/src/transformers/models/layoutxlm/tokenization_layoutxlm.py +++ b/src/transformers/models/layoutxlm/tokenization_layoutxlm.py @@ -447,6 +447,7 @@ def __call__( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: 
Optional[bool] = None, @@ -550,6 +551,7 @@ def _is_valid_text_input(t): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -572,6 +574,7 @@ def _is_valid_text_input(t): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -599,6 +602,7 @@ def _batch_encode_plus( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -627,6 +631,7 @@ def _batch_encode_plus( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -651,6 +656,7 @@ def _batch_prepare_for_model( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -682,6 +688,7 @@ def _batch_prepare_for_model( max_length=max_length, stride=stride, pad_to_multiple_of=None, # we pad in batch afterward + padding_side=None, # we pad in batch afterward return_attention_mask=False, # we pad in batch afterward return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -702,6 +709,7 @@ def _batch_prepare_for_model( padding=padding_strategy.value, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + 
padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -721,6 +729,7 @@ def _encode_plus( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -751,6 +760,7 @@ def _encode_plus( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, prepend_batch_axis=True, return_attention_mask=return_attention_mask, @@ -774,6 +784,7 @@ def prepare_for_model( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -947,6 +958,7 @@ def prepare_for_model( max_length=max_length, padding=padding_strategy.value, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -1090,6 +1102,7 @@ def _pad( max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: """ @@ -1112,6 +1125,9 @@ def _pad( pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side (`str`, *optional*): + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. 
return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ @@ -1135,7 +1151,8 @@ def _pad( if needs_to_be_padded: difference = max_length - len(required_input) - if self.padding_side == "right": + padding_side = padding_side if padding_side is not None else self.padding_side + if padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: @@ -1149,7 +1166,7 @@ def _pad( if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference - elif self.padding_side == "left": + elif padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: @@ -1164,6 +1181,6 @@ def _pad( encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input else: - raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + raise ValueError("Invalid padding strategy:" + str(padding_side)) return encoded_inputs diff --git a/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py b/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py index 6d68cb9f18e7d6..7d12cec496ea30 100644 --- a/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py +++ b/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py @@ -277,6 +277,7 @@ def __call__( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, 
return_attention_mask: Optional[bool] = None, @@ -380,6 +381,7 @@ def _is_valid_text_input(t): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -402,6 +404,7 @@ def _is_valid_text_input(t): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -442,6 +445,7 @@ def _batch_encode_plus( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -462,6 +466,7 @@ def _batch_encode_plus( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, ) if is_pair: @@ -595,6 +600,7 @@ def _encode_plus( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[bool] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -623,6 +629,7 @@ def _encode_plus( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -655,6 +662,7 @@ def _pad( max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: """ @@ -677,6 +685,9 @@ def _pad( pad_to_multiple_of: (optional) Integer if set will pad the 
sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side (`str`, *optional*): + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ @@ -700,7 +711,8 @@ def _pad( if needs_to_be_padded: difference = max_length - len(required_input) - if self.padding_side == "right": + padding_side = padding_side if padding_side is not None else self.padding_side + if padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: @@ -714,7 +726,7 @@ def _pad( if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference - elif self.padding_side == "left": + elif padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: @@ -729,7 +741,7 @@ def _pad( encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input else: - raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + raise ValueError("Invalid padding strategy:" + str(padding_side)) return encoded_inputs diff --git a/src/transformers/models/led/tokenization_led.py b/src/transformers/models/led/tokenization_led.py index aaf09e6d149eb1..6c1ec9526aefbf 100644 --- a/src/transformers/models/led/tokenization_led.py +++ 
b/src/transformers/models/led/tokenization_led.py @@ -412,6 +412,7 @@ def _pad( max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: encoded_inputs = super()._pad( @@ -419,6 +420,7 @@ def _pad( max_length=max_length, padding_strategy=padding_strategy, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) diff --git a/src/transformers/models/led/tokenization_led_fast.py b/src/transformers/models/led/tokenization_led_fast.py index ca15eb997bed5b..6ee69fbe792752 100644 --- a/src/transformers/models/led/tokenization_led_fast.py +++ b/src/transformers/models/led/tokenization_led_fast.py @@ -288,6 +288,7 @@ def _pad( max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: encoded_inputs = super()._pad( @@ -295,6 +296,7 @@ def _pad( max_length=max_length, padding_strategy=padding_strategy, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) diff --git a/src/transformers/models/luke/tokenization_luke.py b/src/transformers/models/luke/tokenization_luke.py index 1a570992ffb406..e06b9c753fe596 100644 --- a/src/transformers/models/luke/tokenization_luke.py +++ b/src/transformers/models/luke/tokenization_luke.py @@ -570,6 +570,7 @@ def __call__( stride: int = 0, is_split_into_words: Optional[bool] = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -662,6 +663,7 @@ def __call__( stride=stride, 
is_split_into_words=is_split_into_words, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -688,6 +690,7 @@ def __call__( stride=stride, is_split_into_words=is_split_into_words, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -715,6 +718,7 @@ def _encode_plus( stride: int = 0, is_split_into_words: Optional[bool] = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -769,6 +773,7 @@ def _encode_plus( max_entity_length=max_entity_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, prepend_batch_axis=True, return_attention_mask=return_attention_mask, @@ -796,6 +801,7 @@ def _batch_encode_plus( stride: int = 0, is_split_into_words: Optional[bool] = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -876,6 +882,7 @@ def _batch_encode_plus( max_entity_length=max_entity_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -1070,6 +1077,7 @@ def _batch_prepare_for_model( max_entity_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[str] = None, return_token_type_ids: 
Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -1112,6 +1120,7 @@ def _batch_prepare_for_model( max_entity_length=max_entity_length, stride=stride, pad_to_multiple_of=None, # we pad in batch afterward + padding_side=None, # we pad in batch afterward return_attention_mask=False, # we pad in batch afterward return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -1132,6 +1141,7 @@ def _batch_prepare_for_model( padding=padding_strategy.value, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -1155,6 +1165,7 @@ def prepare_for_model( max_entity_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -1357,6 +1368,7 @@ def prepare_for_model( max_entity_length=max_entity_length, padding=padding_strategy.value, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -1382,6 +1394,7 @@ def pad( max_length: Optional[int] = None, max_entity_length: Optional[int] = None, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, verbose: bool = True, @@ -1418,6 +1431,9 @@ def pad( pad_to_multiple_of (`int`, *optional*): If set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side: + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. 
return_attention_mask (`bool`, *optional*): Whether to return the attention mask. If left to the default, will return the attention mask according to the specific tokenizer's default, defined by the `return_outputs` attribute. [What are attention @@ -1495,6 +1511,7 @@ def pad( max_entity_length=max_entity_length, padding_strategy=padding_strategy, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) return BatchEncoding(encoded_inputs, tensor_type=return_tensors) @@ -1519,6 +1536,7 @@ def pad( max_entity_length=max_entity_length, padding_strategy=padding_strategy, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -1536,6 +1554,7 @@ def _pad( max_entity_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: """ @@ -1562,6 +1581,9 @@ def _pad( pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side: + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. 
return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ @@ -1600,9 +1622,10 @@ def _pad( if needs_to_be_padded: difference = max_length - len(encoded_inputs["input_ids"]) + padding_side = padding_side if padding_side is not None else self.padding_side if entities_provided: entity_difference = max_entity_length - len(encoded_inputs["entity_ids"]) - if self.padding_side == "right": + if padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if entities_provided: @@ -1633,7 +1656,7 @@ def _pad( encoded_inputs["entity_end_positions"] + [0] * entity_difference ) - elif self.padding_side == "left": + elif padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] if entities_provided: @@ -1664,7 +1687,7 @@ def _pad( "entity_end_positions" ] else: - raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + raise ValueError("Invalid padding strategy:" + str(padding_side)) return encoded_inputs diff --git a/src/transformers/models/markuplm/tokenization_markuplm.py b/src/transformers/models/markuplm/tokenization_markuplm.py index c77865abc934c9..e5de1e4e765c93 100644 --- a/src/transformers/models/markuplm/tokenization_markuplm.py +++ b/src/transformers/models/markuplm/tokenization_markuplm.py @@ -503,6 +503,7 @@ def __call__( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -602,6 +603,7 @@ def _is_valid_text_input(t): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, 
return_attention_mask=return_attention_mask, @@ -624,6 +626,7 @@ def _is_valid_text_input(t): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -652,6 +655,7 @@ def batch_encode_plus( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -683,6 +687,7 @@ def batch_encode_plus( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -710,6 +715,7 @@ def _batch_encode_plus( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -738,6 +744,7 @@ def _batch_encode_plus( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -762,6 +769,7 @@ def _batch_prepare_for_model( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -793,6 +801,7 @@ def _batch_prepare_for_model( max_length=max_length, stride=stride, pad_to_multiple_of=None, # we pad in batch afterward + 
padding_side=None, # we pad in batch afterward return_attention_mask=False, # we pad in batch afterward return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -813,6 +822,7 @@ def _batch_prepare_for_model( padding=padding_strategy.value, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -833,6 +843,7 @@ def encode( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -854,6 +865,7 @@ def encode( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -880,6 +892,7 @@ def encode_plus( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -923,6 +936,7 @@ def encode_plus( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -946,6 +960,7 @@ def _encode_plus( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -976,6 +991,7 @@ def _encode_plus( max_length=max_length, stride=stride, 
pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, prepend_batch_axis=True, return_attention_mask=return_attention_mask, @@ -999,6 +1015,7 @@ def prepare_for_model( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -1203,6 +1220,7 @@ def prepare_for_model( max_length=max_length, padding=padding_strategy.value, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -1357,6 +1375,7 @@ def _pad( max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: """ @@ -1376,6 +1395,9 @@ def _pad( pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side: + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. 
return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ @@ -1399,7 +1421,8 @@ def _pad( if needs_to_be_padded: difference = max_length - len(required_input) - if self.padding_side == "right": + padding_side = padding_side if padding_side is not None else self.padding_side + if padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: @@ -1419,7 +1442,7 @@ def _pad( if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference - elif self.padding_side == "left": + elif padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: @@ -1440,6 +1463,6 @@ def _pad( encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input else: - raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + raise ValueError("Invalid padding strategy:" + str(padding_side)) return encoded_inputs diff --git a/src/transformers/models/markuplm/tokenization_markuplm_fast.py b/src/transformers/models/markuplm/tokenization_markuplm_fast.py index ff0e4ffeb56e9f..796459876425b4 100644 --- a/src/transformers/models/markuplm/tokenization_markuplm_fast.py +++ b/src/transformers/models/markuplm/tokenization_markuplm_fast.py @@ -286,6 +286,7 @@ def __call__( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, 
return_attention_mask: Optional[bool] = None, @@ -385,6 +386,7 @@ def _is_valid_text_input(t): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -407,6 +409,7 @@ def _is_valid_text_input(t): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -435,6 +438,7 @@ def batch_encode_plus( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -466,6 +470,7 @@ def batch_encode_plus( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -498,6 +503,7 @@ def encode_plus( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -541,6 +547,7 @@ def encode_plus( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -568,6 +575,7 @@ def _batch_encode_plus( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[str] = None, 
return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -587,6 +595,7 @@ def _batch_encode_plus( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, ) if is_pair: @@ -721,6 +730,7 @@ def _encode_plus( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[bool] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -749,6 +759,7 @@ def _encode_plus( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -781,6 +792,7 @@ def _pad( max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: """ @@ -800,6 +812,9 @@ def _pad( pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side: + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. 
return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ @@ -823,7 +838,8 @@ def _pad( if needs_to_be_padded: difference = max_length - len(required_input) - if self.padding_side == "right": + padding_side = padding_side if padding_side is not None else self.padding_side + if padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: @@ -843,7 +859,7 @@ def _pad( if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference - elif self.padding_side == "left": + elif padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: @@ -864,7 +880,7 @@ def _pad( encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input else: - raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + raise ValueError("Invalid padding strategy:" + str(padding_side)) return encoded_inputs diff --git a/src/transformers/models/mluke/tokenization_mluke.py b/src/transformers/models/mluke/tokenization_mluke.py index 3ac8191402af90..f087c0d92fc63f 100644 --- a/src/transformers/models/mluke/tokenization_mluke.py +++ b/src/transformers/models/mluke/tokenization_mluke.py @@ -399,6 +399,7 @@ def __call__( stride: int = 0, is_split_into_words: Optional[bool] = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ 
-491,6 +492,7 @@ def __call__( stride=stride, is_split_into_words=is_split_into_words, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -517,6 +519,7 @@ def __call__( stride=stride, is_split_into_words=is_split_into_words, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -545,6 +548,7 @@ def _encode_plus( stride: int = 0, is_split_into_words: Optional[bool] = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -599,6 +603,7 @@ def _encode_plus( max_entity_length=max_entity_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, prepend_batch_axis=True, return_attention_mask=return_attention_mask, @@ -627,6 +632,7 @@ def _batch_encode_plus( stride: int = 0, is_split_into_words: Optional[bool] = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -707,6 +713,7 @@ def _batch_encode_plus( max_entity_length=max_entity_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -904,6 +911,7 @@ def _batch_prepare_for_model( max_entity_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: 
Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -946,6 +954,7 @@ def _batch_prepare_for_model( max_entity_length=max_entity_length, stride=stride, pad_to_multiple_of=None, # we pad in batch afterward + padding_side=None, # we pad in batch afterward return_attention_mask=False, # we pad in batch afterward return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -966,6 +975,7 @@ def _batch_prepare_for_model( padding=padding_strategy.value, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -990,6 +1000,7 @@ def prepare_for_model( max_entity_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -1192,6 +1203,7 @@ def prepare_for_model( max_entity_length=max_entity_length, padding=padding_strategy.value, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -1218,6 +1230,7 @@ def pad( max_length: Optional[int] = None, max_entity_length: Optional[int] = None, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, verbose: bool = True, @@ -1254,6 +1267,9 @@ def pad( pad_to_multiple_of (`int`, *optional*): If set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side: + The side on which the model should have padding applied. Should be selected between ['right', 'left']. 
+ Default value is picked from the class attribute of the same name. return_attention_mask (`bool`, *optional*): Whether to return the attention mask. If left to the default, will return the attention mask according to the specific tokenizer's default, defined by the `return_outputs` attribute. [What are attention @@ -1331,6 +1347,7 @@ def pad( max_entity_length=max_entity_length, padding_strategy=padding_strategy, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) return BatchEncoding(encoded_inputs, tensor_type=return_tensors) @@ -1355,6 +1372,7 @@ def pad( max_entity_length=max_entity_length, padding_strategy=padding_strategy, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -1373,6 +1391,7 @@ def _pad( max_entity_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: """ @@ -1399,6 +1418,9 @@ def _pad( pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side: + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. 
return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ @@ -1437,9 +1459,10 @@ def _pad( if needs_to_be_padded: difference = max_length - len(encoded_inputs["input_ids"]) + padding_side = padding_side if padding_side is not None else self.padding_side if entities_provided: entity_difference = max_entity_length - len(encoded_inputs["entity_ids"]) - if self.padding_side == "right": + if padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if entities_provided: @@ -1470,7 +1493,7 @@ def _pad( encoded_inputs["entity_end_positions"] + [0] * entity_difference ) - elif self.padding_side == "left": + elif padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] if entities_provided: @@ -1501,7 +1524,7 @@ def _pad( "entity_end_positions" ] else: - raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + raise ValueError("Invalid padding strategy:" + str(padding_side)) return encoded_inputs diff --git a/src/transformers/models/roc_bert/tokenization_roc_bert.py b/src/transformers/models/roc_bert/tokenization_roc_bert.py index eaf2a1a491335d..3a980c0ae66f68 100644 --- a/src/transformers/models/roc_bert/tokenization_roc_bert.py +++ b/src/transformers/models/roc_bert/tokenization_roc_bert.py @@ -210,6 +210,7 @@ def _encode_plus( stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -283,6 +284,7 @@ def get_input_ids(text): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, prepend_batch_axis=True, 
return_attention_mask=return_attention_mask, @@ -308,6 +310,7 @@ def prepare_for_model( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -462,6 +465,7 @@ def prepare_for_model( max_length=max_length, padding=padding_strategy.value, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -480,6 +484,7 @@ def _pad( max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: # Load from model defaults @@ -502,8 +507,9 @@ def _pad( if needs_to_be_padded: difference = max_length - len(required_input) + padding_side = padding_side if padding_side is not None else self.padding_side - if self.padding_side == "right": + if padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: @@ -516,7 +522,7 @@ def _pad( if key in encoded_inputs: encoded_inputs[key] = encoded_inputs[key] + [self.pad_token_id] * difference encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference - elif self.padding_side == "left": + elif padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: @@ -530,7 +536,7 @@ def _pad( encoded_inputs[key] = [self.pad_token_id] * difference + encoded_inputs[key] encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input else: - raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + 
raise ValueError("Invalid padding strategy:" + str(padding_side)) return encoded_inputs @@ -551,6 +557,7 @@ def _batch_encode_plus( stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -627,6 +634,7 @@ def get_input_ids(text): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -650,6 +658,7 @@ def _batch_prepare_for_model( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -686,6 +695,7 @@ def _batch_prepare_for_model( max_length=max_length, stride=stride, pad_to_multiple_of=None, # we pad in batch afterward + padding_side=None, # we pad in batch afterward return_attention_mask=False, # we pad in batch afterward return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -706,6 +716,7 @@ def _batch_prepare_for_model( padding=padding_strategy.value, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) diff --git a/src/transformers/models/tapas/tokenization_tapas.py b/src/transformers/models/tapas/tokenization_tapas.py index 2da9fe40c1ce88..867e53ff89078a 100644 --- a/src/transformers/models/tapas/tokenization_tapas.py +++ b/src/transformers/models/tapas/tokenization_tapas.py @@ -517,6 +517,7 @@ def __call__( truncation: Union[bool, str, TapasTruncationStrategy] = False, max_length: Optional[int] = None, 
pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -581,6 +582,7 @@ def __call__( truncation=truncation, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -602,6 +604,7 @@ def __call__( truncation=truncation, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -631,6 +634,7 @@ def batch_encode_plus( truncation: Union[bool, str, TapasTruncationStrategy] = False, max_length: Optional[int] = None, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -699,6 +703,7 @@ def batch_encode_plus( truncation=truncation, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -738,6 +743,7 @@ def _batch_encode_plus( truncation: Union[bool, str, TapasTruncationStrategy] = False, max_length: Optional[int] = None, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = True, return_attention_mask: Optional[bool] = None, @@ -768,6 +774,7 @@ def _batch_encode_plus( add_special_tokens=add_special_tokens, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, 
prepend_batch_axis=True, return_attention_mask=return_attention_mask, @@ -797,6 +804,7 @@ def _batch_prepare_for_model( truncation: Union[bool, str, TapasTruncationStrategy] = False, max_length: Optional[int] = None, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = True, return_attention_mask: Optional[bool] = True, @@ -823,6 +831,7 @@ def _batch_prepare_for_model( truncation=truncation, max_length=max_length, pad_to_multiple_of=None, # we pad in batch afterwards + padding_side=None, # we pad in batch afterward return_attention_mask=False, # we pad in batch afterwards return_token_type_ids=return_token_type_ids, return_special_tokens_mask=return_special_tokens_mask, @@ -844,6 +853,7 @@ def _batch_prepare_for_model( padding=padding, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -912,6 +922,7 @@ def encode_plus( truncation: Union[bool, str, TapasTruncationStrategy] = False, max_length: Optional[int] = None, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -968,6 +979,7 @@ def encode_plus( padding=padding, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -993,6 +1005,7 @@ def _encode_plus( truncation: Union[bool, str, TapasTruncationStrategy] = False, max_length: Optional[int] = None, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = True, return_attention_mask: Optional[bool] = 
True, @@ -1024,6 +1037,7 @@ def _encode_plus( padding=padding, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, prepend_batch_axis=True, return_attention_mask=return_attention_mask, @@ -1051,6 +1065,7 @@ def prepare_for_model( truncation: Union[bool, str, TapasTruncationStrategy] = False, max_length: Optional[int] = None, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = True, return_attention_mask: Optional[bool] = True, @@ -1214,6 +1229,7 @@ def prepare_for_model( max_length=max_length, padding=padding.value, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -1754,6 +1770,7 @@ def _pad( max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: """ @@ -1776,6 +1793,9 @@ def _pad( pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side: + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. 
return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ @@ -1799,7 +1819,8 @@ def _pad( if needs_to_be_padded: difference = max_length - len(encoded_inputs["input_ids"]) - if self.padding_side == "right": + padding_side = padding_side if padding_side is not None else self.padding_side + if padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: @@ -1817,7 +1838,7 @@ def _pad( if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference - elif self.padding_side == "left": + elif padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: @@ -1836,7 +1857,7 @@ def _pad( encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs["input_ids"] else: - raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + raise ValueError("Invalid padding strategy:" + str(padding_side)) return encoded_inputs diff --git a/src/transformers/models/udop/tokenization_udop.py b/src/transformers/models/udop/tokenization_udop.py index 4be9799819168c..e40c07a58aceb7 100644 --- a/src/transformers/models/udop/tokenization_udop.py +++ b/src/transformers/models/udop/tokenization_udop.py @@ -551,6 +551,7 @@ def call_boxes( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ 
-654,6 +655,7 @@ def _is_valid_text_input(t): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -676,6 +678,7 @@ def _is_valid_text_input(t): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -704,6 +707,7 @@ def batch_encode_plus_boxes( stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -746,6 +750,7 @@ def batch_encode_plus_boxes( stride=stride, is_split_into_words=is_split_into_words, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -813,6 +818,7 @@ def encode_plus_boxes( stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -865,6 +871,7 @@ def encode_plus_boxes( stride=stride, is_split_into_words=is_split_into_words, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -892,6 +899,7 @@ def _batch_encode_plus_boxes( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: 
Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -920,6 +928,7 @@ def _batch_encode_plus_boxes( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -944,6 +953,7 @@ def _batch_prepare_for_model_boxes( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -975,6 +985,7 @@ def _batch_prepare_for_model_boxes( max_length=max_length, stride=stride, pad_to_multiple_of=None, # we pad in batch afterward + padding_side=None, # we pad in batch afterward return_attention_mask=False, # we pad in batch afterward return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -995,6 +1006,7 @@ def _batch_prepare_for_model_boxes( padding=padding_strategy.value, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -1014,6 +1026,7 @@ def _encode_plus_boxes( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -1044,6 +1057,7 @@ def _encode_plus_boxes( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, prepend_batch_axis=True, return_attention_mask=return_attention_mask, @@ -1067,6 +1081,7 @@ def prepare_for_model_boxes( max_length: Optional[int] = None, 
stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -1240,6 +1255,7 @@ def prepare_for_model_boxes( max_length=max_length, padding=padding_strategy.value, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -1385,6 +1401,7 @@ def _pad( max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: """ @@ -1407,6 +1424,9 @@ def _pad( pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side (`str`, *optional*): + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. 
return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ @@ -1430,7 +1450,8 @@ def _pad( if needs_to_be_padded: difference = max_length - len(required_input) - if self.padding_side == "right": + padding_side = padding_side if padding_side is not None else self.padding_side + if padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: @@ -1444,7 +1465,7 @@ def _pad( if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference - elif self.padding_side == "left": + elif padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: @@ -1459,6 +1480,6 @@ def _pad( encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input else: - raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + raise ValueError("Invalid padding strategy:" + str(padding_side)) return encoded_inputs diff --git a/src/transformers/models/udop/tokenization_udop_fast.py b/src/transformers/models/udop/tokenization_udop_fast.py index 8340c4af4e2bb7..8ee0577fa10e58 100644 --- a/src/transformers/models/udop/tokenization_udop_fast.py +++ b/src/transformers/models/udop/tokenization_udop_fast.py @@ -286,6 +286,7 @@ def call_boxes( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = 
None, @@ -389,6 +390,7 @@ def _is_valid_text_input(t): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -411,6 +413,7 @@ def _is_valid_text_input(t): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -453,6 +456,7 @@ def batch_encode_plus_boxes( stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -501,6 +505,7 @@ def batch_encode_plus_boxes( stride=stride, is_split_into_words=is_split_into_words, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -528,6 +533,7 @@ def _batch_encode_plus_boxes( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -548,6 +554,7 @@ def _batch_encode_plus_boxes( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, ) if is_pair: @@ -684,6 +691,7 @@ def _encode_plus_boxes( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[bool] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -712,6 +720,7 @@ def 
_encode_plus_boxes( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -794,6 +803,7 @@ def encode_plus_boxes( stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -846,6 +856,7 @@ def encode_plus_boxes( stride=stride, is_split_into_words=is_split_into_words, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -864,6 +875,7 @@ def _pad( max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, ) -> dict: """ @@ -886,6 +898,9 @@ def _pad( pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side (`str`, *optional*): + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. 
return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ @@ -909,7 +924,8 @@ def _pad( if needs_to_be_padded: difference = max_length - len(required_input) - if self.padding_side == "right": + padding_side = padding_side if padding_side is not None else self.padding_side + if padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: @@ -923,7 +939,7 @@ def _pad( if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference - elif self.padding_side == "left": + elif padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: @@ -938,7 +954,7 @@ def _pad( encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input else: - raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + raise ValueError("Invalid padding strategy:" + str(padding_side)) return encoded_inputs diff --git a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py index 647b18521d0515..c1a333fe48c6b4 100644 --- a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py +++ b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py @@ -781,6 +781,7 @@ def __call__( padding: Union[bool, str, PaddingStrategy] = False, max_length: Optional[int] = None, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, verbose: bool = True, **kwargs, @@ -794,6 
+795,10 @@ def __call__( The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float values, a list of numpy array or a list of list of float values. Must be mono channel audio, not stereo, i.e. single float per timestep. + + padding_side (`str`, *optional*): + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. """ is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1 @@ -825,6 +830,7 @@ def __call__( padding=padding, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=self.return_attention_mask, return_tensors=return_tensors, verbose=verbose, diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py index f04eaae4525de9..6a5bff3679f8aa 100644 --- a/src/transformers/tokenization_utils.py +++ b/src/transformers/tokenization_utils.py @@ -749,6 +749,7 @@ def _encode_plus( stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -806,6 +807,7 @@ def get_input_ids(text): max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, prepend_batch_axis=True, return_attention_mask=return_attention_mask, @@ -833,6 +835,7 @@ def _batch_encode_plus( stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -891,6 +894,7 @@ def get_input_ids(text): max_length=max_length, 
stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -913,6 +917,7 @@ def _batch_prepare_for_model( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -942,6 +947,7 @@ def _batch_prepare_for_model( max_length=max_length, stride=stride, pad_to_multiple_of=None, # we pad in batch afterward + padding_side=None, # we pad in batch afterward return_attention_mask=False, # we pad in batch afterward return_token_type_ids=return_token_type_ids, return_overflowing_tokens=return_overflowing_tokens, @@ -963,6 +969,7 @@ def _batch_prepare_for_model( padding=padding_strategy.value, max_length=max_length, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 5e9170456a07ea..93dea5ba09de36 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1427,6 +1427,9 @@ def all_special_ids(self) -> List[int]: If set will pad the sequence to a multiple of the provided value. Requires `padding` to be activated. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side (`str`, *optional*): + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. return_tensors (`str` or [`~utils.TensorType`], *optional*): If set, will return tensors instead of list of python integers. 
Acceptable values are: @@ -2767,6 +2770,7 @@ def encode( truncation: Union[bool, str, TruncationStrategy] = None, max_length: Optional[int] = None, stride: int = 0, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs, ) -> List[int]: @@ -2793,6 +2797,7 @@ def encode( truncation=truncation, max_length=max_length, stride=stride, + padding_side=padding_side, return_tensors=return_tensors, **kwargs, ) @@ -2956,6 +2961,7 @@ def __call__( stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -2997,6 +3003,7 @@ def __call__( "stride": stride, "is_split_into_words": is_split_into_words, "pad_to_multiple_of": pad_to_multiple_of, + "padding_side": padding_side, "return_tensors": return_tensors, "return_token_type_ids": return_token_type_ids, "return_attention_mask": return_attention_mask, @@ -3041,6 +3048,7 @@ def _call_one( stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -3111,6 +3119,7 @@ def _is_valid_text_input(t): stride=stride, is_split_into_words=is_split_into_words, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -3133,6 +3142,7 @@ def _is_valid_text_input(t): stride=stride, is_split_into_words=is_split_into_words, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -3157,6 
+3167,7 @@ def encode_plus( stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -3207,6 +3218,7 @@ def encode_plus( stride=stride, is_split_into_words=is_split_into_words, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -3230,6 +3242,7 @@ def _encode_plus( stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -3261,6 +3274,7 @@ def batch_encode_plus( stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -3307,6 +3321,7 @@ def batch_encode_plus( stride=stride, is_split_into_words=is_split_into_words, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -3336,6 +3351,7 @@ def _batch_encode_plus( stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -3361,6 +3377,7 @@ def pad( padding: Union[bool, str, PaddingStrategy] = True, max_length: Optional[int] = None, pad_to_multiple_of: 
Optional[int] = None, + padding_side: Optional[bool] = None, return_attention_mask: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, verbose: bool = True, @@ -3409,6 +3426,9 @@ def pad( This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side (`str`, *optional*): + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. return_attention_mask (`bool`, *optional*): Whether to return the attention mask. If left to the default, will return the attention mask according to the specific tokenizer's default, defined by the `return_outputs` attribute. @@ -3491,6 +3511,7 @@ def pad( max_length=max_length, padding_strategy=padding_strategy, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) return BatchEncoding(encoded_inputs, tensor_type=return_tensors) @@ -3512,6 +3533,7 @@ def pad( max_length=max_length, padding_strategy=padding_strategy, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -3573,6 +3595,7 @@ def prepare_for_model( max_length: Optional[int] = None, stride: int = 0, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -3686,6 +3709,7 @@ def prepare_for_model( max_length=max_length, padding=padding_strategy.value, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_attention_mask=return_attention_mask, ) @@ -3828,6 +3852,7 @@ def _pad( max_length: Optional[int] = None, padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, 
return_attention_mask: Optional[bool] = None, ) -> dict: """ @@ -3843,13 +3868,16 @@ def _pad( - PaddingStrategy.LONGEST Pad to the longest sequence in the batch - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) - PaddingStrategy.DO_NOT_PAD: Do not pad - The tokenizer padding sides are defined in self.padding_side: + The tokenizer padding sides are defined in `padding_side` argument: - 'left': pads on the left of the sequences - 'right': pads on the right of the sequences pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side: + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) """ @@ -3873,8 +3901,9 @@ def _pad( if needs_to_be_padded: difference = max_length - len(required_input) + padding_side = padding_side if padding_side is not None else self.padding_side - if self.padding_side == "right": + if padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: @@ -3884,7 +3913,7 @@ def _pad( if "special_tokens_mask" in encoded_inputs: encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference - elif self.padding_side == "left": + elif padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] if "token_type_ids" in encoded_inputs: @@ -3895,7 +3924,7 @@ def _pad( encoded_inputs["special_tokens_mask"] = [1] * difference + 
encoded_inputs["special_tokens_mask"] encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input else: - raise ValueError(f"Invalid padding strategy:{self.padding_side}") + raise ValueError(f"Invalid padding strategy:{padding_side}") return encoded_inputs diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py index 7d5446d7cbf233..724484b3b30b88 100644 --- a/src/transformers/tokenization_utils_fast.py +++ b/src/transformers/tokenization_utils_fast.py @@ -429,6 +429,7 @@ def set_truncation_and_padding( max_length: int, stride: int, pad_to_multiple_of: Optional[int], + padding_side: Optional[bool], ): """ Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers @@ -450,6 +451,9 @@ def set_truncation_and_padding( pad_to_multiple_of (`int`, *optional*): If set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side (`str`, *optional*): + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. 
""" _truncation = self._tokenizer.truncation _padding = self._tokenizer.padding @@ -484,7 +488,7 @@ def set_truncation_and_padding( length = max_length if padding_strategy == PaddingStrategy.MAX_LENGTH else None target = { "length": length, - "direction": self.padding_side, + "direction": padding_side if padding_side is not None else self.padding_side, "pad_id": self.pad_token_id, "pad_token": self.pad_token, "pad_type_id": self.pad_token_type_id, @@ -505,6 +509,7 @@ def _batch_encode_plus( stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -527,6 +532,7 @@ def _batch_encode_plus( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, ) if self._tokenizer.encode_special_tokens != split_special_tokens: @@ -593,6 +599,7 @@ def _encode_plus( stride: int = 0, is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[bool] = None, return_tensors: Optional[bool] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -614,6 +621,7 @@ def _encode_plus( max_length=max_length, stride=stride, pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, return_tensors=return_tensors, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, diff --git a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py index bb526e140e5740..19a6aeec46f935 100644 --- a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py @@ -21,6 +21,8 @@ import unittest from typing import List +from parameterized import parameterized + from transformers import ( AddedToken, 
LayoutLMv2TokenizerFast, @@ -393,7 +395,8 @@ def test_right_and_left_truncation(self): def test_split_special_tokens(self): pass - def test_encode_plus_with_padding(self): + @parameterized.expand([(True,), (False,)]) + def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): @@ -444,15 +447,18 @@ def test_encode_plus_with_padding(self): self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask) # Test right padding - tokenizer.padding_side = "right" + tokenizer_kwargs_right = { + "max_length": sequence_length + padding_size, + "padding": "max_length", + "return_special_tokens_mask": True, + } + + if not use_padding_as_call_kwarg: + tokenizer.padding_side = "right" + else: + tokenizer_kwargs_right["padding_side"] = "right" - right_padded_sequence = tokenizer.encode_plus( - words, - boxes=boxes, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) + right_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_right) right_padded_input_ids = right_padded_sequence["input_ids"] right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"] @@ -463,14 +469,18 @@ def test_encode_plus_with_padding(self): self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask) # Test left padding - tokenizer.padding_side = "left" - left_padded_sequence = tokenizer.encode_plus( - words, - boxes=boxes, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) + tokenizer_kwargs_left = { + "max_length": sequence_length + padding_size, + "padding": "max_length", + "return_special_tokens_mask": True, + } + + if not use_padding_as_call_kwarg: + tokenizer.padding_side = "left" + else: + tokenizer_kwargs_left["padding_side"] = "left" + + 
left_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_left) left_padded_input_ids = left_padded_sequence["input_ids"] left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"] left_padded_sequence_length = len(left_padded_input_ids) diff --git a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py index 5ea384f0b26422..007e23430b3a56 100644 --- a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py @@ -22,6 +22,8 @@ import unittest from typing import List +from parameterized import parameterized + from transformers import ( AddedToken, LayoutLMv3TokenizerFast, @@ -273,7 +275,8 @@ def test_right_and_left_truncation(self): def test_split_special_tokens(self): pass - def test_encode_plus_with_padding(self): + @parameterized.expand([(True,), (False,)]) + def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): @@ -324,15 +327,18 @@ def test_encode_plus_with_padding(self): self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask) # Test right padding - tokenizer.padding_side = "right" + tokenizer_kwargs_right = { + "max_length": sequence_length + padding_size, + "padding": "max_length", + "return_special_tokens_mask": True, + } + + if not use_padding_as_call_kwarg: + tokenizer.padding_side = "right" + else: + tokenizer_kwargs_right["padding_side"] = "right" - right_padded_sequence = tokenizer.encode_plus( - words, - boxes=boxes, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) + right_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_right) right_padded_input_ids = right_padded_sequence["input_ids"] right_padded_special_tokens_mask = 
right_padded_sequence["special_tokens_mask"] @@ -343,14 +349,18 @@ def test_encode_plus_with_padding(self): self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask) # Test left padding - tokenizer.padding_side = "left" - left_padded_sequence = tokenizer.encode_plus( - words, - boxes=boxes, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) + tokenizer_kwargs_left = { + "max_length": sequence_length + padding_size, + "padding": "max_length", + "return_special_tokens_mask": True, + } + + if not use_padding_as_call_kwarg: + tokenizer.padding_side = "left" + else: + tokenizer_kwargs_left["padding_side"] = "left" + + left_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_left) left_padded_input_ids = left_padded_sequence["input_ids"] left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"] left_padded_sequence_length = len(left_padded_input_ids) diff --git a/tests/models/layoutxlm/test_tokenization_layoutxlm.py b/tests/models/layoutxlm/test_tokenization_layoutxlm.py index c0e44fcb30491f..8acd3716cf576b 100644 --- a/tests/models/layoutxlm/test_tokenization_layoutxlm.py +++ b/tests/models/layoutxlm/test_tokenization_layoutxlm.py @@ -19,6 +19,8 @@ import unittest from typing import List +from parameterized import parameterized + from transformers import ( AddedToken, LayoutXLMTokenizerFast, @@ -324,7 +326,8 @@ def test_encode_decode_with_spaces(self): decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens) self.assertIn(decoded, [output, output.lower()]) - def test_encode_plus_with_padding(self): + @parameterized.expand([(True,), (False,)]) + def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): @@ -375,15 +378,18 @@ def 
test_encode_plus_with_padding(self): self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask) # Test right padding - tokenizer.padding_side = "right" + tokenizer_kwargs_right = { + "max_length": sequence_length + padding_size, + "padding": "max_length", + "return_special_tokens_mask": True, + } + + if not use_padding_as_call_kwarg: + tokenizer.padding_side = "right" + else: + tokenizer_kwargs_right["padding_side"] = "right" - right_padded_sequence = tokenizer.encode_plus( - words, - boxes=boxes, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) + right_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_right) right_padded_input_ids = right_padded_sequence["input_ids"] right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"] @@ -394,14 +400,18 @@ def test_encode_plus_with_padding(self): self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask) # Test left padding - tokenizer.padding_side = "left" - left_padded_sequence = tokenizer.encode_plus( - words, - boxes=boxes, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) + tokenizer_kwargs_left = { + "max_length": sequence_length + padding_size, + "padding": "max_length", + "return_special_tokens_mask": True, + } + + if not use_padding_as_call_kwarg: + tokenizer.padding_side = "left" + else: + tokenizer_kwargs_left["padding_side"] = "left" + + left_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_left) left_padded_input_ids = left_padded_sequence["input_ids"] left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"] left_padded_sequence_length = len(left_padded_input_ids) diff --git a/tests/models/markuplm/test_tokenization_markuplm.py b/tests/models/markuplm/test_tokenization_markuplm.py index 458df94ec2fbcc..fcdde2eb8a874b 100644 --- 
a/tests/models/markuplm/test_tokenization_markuplm.py +++ b/tests/models/markuplm/test_tokenization_markuplm.py @@ -22,6 +22,8 @@ import unittest from typing import List +from parameterized import parameterized + from transformers import ( AddedToken, MarkupLMTokenizerFast, @@ -211,7 +213,8 @@ def test_encode_decode_with_spaces(self): def test_right_and_left_truncation(self): pass - def test_encode_plus_with_padding(self): + @parameterized.expand([(True,), (False,)]) + def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): @@ -262,15 +265,18 @@ def test_encode_plus_with_padding(self): self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask) # Test right padding - tokenizer.padding_side = "right" + tokenizer_kwargs_right = { + "max_length": sequence_length + padding_size, + "padding": "max_length", + "return_special_tokens_mask": True, + } + + if not use_padding_as_call_kwarg: + tokenizer.padding_side = "right" + else: + tokenizer_kwargs_right["padding_side"] = "right" - right_padded_sequence = tokenizer.encode_plus( - nodes, - xpaths=xpaths, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) + right_padded_sequence = tokenizer.encode_plus(nodes, xpaths=xpaths, **tokenizer_kwargs_right) right_padded_input_ids = right_padded_sequence["input_ids"] right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"] @@ -281,14 +287,18 @@ def test_encode_plus_with_padding(self): self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask) # Test left padding - tokenizer.padding_side = "left" - left_padded_sequence = tokenizer.encode_plus( - nodes, - xpaths=xpaths, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) + tokenizer_kwargs_left = { + 
"max_length": sequence_length + padding_size, + "padding": "max_length", + "return_special_tokens_mask": True, + } + + if not use_padding_as_call_kwarg: + tokenizer.padding_side = "left" + else: + tokenizer_kwargs_left["padding_side"] = "left" + + left_padded_sequence = tokenizer.encode_plus(nodes, xpaths=xpaths, **tokenizer_kwargs_left) left_padded_input_ids = left_padded_sequence["input_ids"] left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"] left_padded_sequence_length = len(left_padded_input_ids) diff --git a/tests/models/tapas/test_tokenization_tapas.py b/tests/models/tapas/test_tokenization_tapas.py index a9b8e9a0c77fa6..49327a39cd80d3 100644 --- a/tests/models/tapas/test_tokenization_tapas.py +++ b/tests/models/tapas/test_tokenization_tapas.py @@ -21,6 +21,7 @@ import numpy as np import pandas as pd +from parameterized import parameterized from transformers import AddedToken, is_torch_available from transformers.models.tapas.tokenization_tapas import ( @@ -494,7 +495,8 @@ def test_encode_decode_with_spaces(self): decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens) self.assertIn(decoded, [output, output.lower()]) - def test_encode_plus_with_padding(self): + @parameterized.expand([(True,), (False,)]) + def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): @@ -547,15 +549,18 @@ def test_encode_plus_with_padding(self): assert special_tokens_mask == not_padded_special_tokens_mask # Test right padding - tokenizer.padding_side = "right" + tokenizer_kwargs_right = { + "max_length": sequence_length + padding_size, + "padding": "max_length", + "return_special_tokens_mask": True, + } + + if not use_padding_as_call_kwarg: + tokenizer.padding_side = "right" + else: + tokenizer_kwargs_right["padding_side"] = "right" - right_padded_sequence = 
tokenizer.encode_plus( - table, - sequence, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) + right_padded_sequence = tokenizer.encode_plus(table, sequence, **tokenizer_kwargs_right) right_padded_input_ids = right_padded_sequence["input_ids"] right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"] @@ -566,14 +571,18 @@ def test_encode_plus_with_padding(self): assert special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask # Test left padding - tokenizer.padding_side = "left" - left_padded_sequence = tokenizer.encode_plus( - table, - sequence, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) + tokenizer_kwargs_left = { + "max_length": sequence_length + padding_size, + "padding": "max_length", + "return_special_tokens_mask": True, + } + + if not use_padding_as_call_kwarg: + tokenizer.padding_side = "left" + else: + tokenizer_kwargs_left["padding_side"] = "left" + + left_padded_sequence = tokenizer.encode_plus(table, sequence, **tokenizer_kwargs_left) left_padded_input_ids = left_padded_sequence["input_ids"] left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"] left_padded_sequence_length = len(left_padded_input_ids) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 64c860e3fc177d..342254dfbdf066 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -2225,7 +2225,15 @@ def test_padding_with_attention_mask(self): else: self.assertListEqual(padded_features["attention_mask"], [[1, 1, 1, 1, 1, 0], [0, 0, 0, 1, 1, 0]]) - def test_encode_plus_with_padding(self): + @parameterized.expand([(True,), (False,)]) + def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool): + """ + This test checks that padding works as expected when tokenizing a sequence. 
+ Padding is expected to have no effect when the input is a single sequence and + the padding-strategy is not `max_length`. Otherwise it pads to the specified max-length + using tokenizer classes `padding_side` attribute. Also, we check that passing `padding_side` + as call time kwarg works same way as when one sets `tokenizer.padding_side` attribute. + """ tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): @@ -2244,8 +2252,6 @@ def test_encode_plus_with_padding(self): sequence_length = len(input_ids) # Test 'longest' and 'no_padding' don't do anything - tokenizer.padding_side = "right" - not_padded_sequence = tokenizer.encode_plus( sequence, padding=True, @@ -2275,14 +2281,18 @@ def test_encode_plus_with_padding(self): self.assertEqual(special_tokens_mask, not_padded_special_tokens_mask) # Test right padding - tokenizer.padding_side = "right" + tokenizer_kwargs_right = { + "max_length": sequence_length + padding_size, + "padding": "max_length", + "return_special_tokens_mask": True, + } - right_padded_sequence = tokenizer.encode_plus( - sequence, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) + if not use_padding_as_call_kwarg: + tokenizer.padding_side = "right" + else: + tokenizer_kwargs_right["padding_side"] = "right" + + right_padded_sequence = tokenizer.encode_plus(sequence, **tokenizer_kwargs_right) right_padded_input_ids = right_padded_sequence["input_ids"] right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"] @@ -2293,13 +2303,18 @@ def test_encode_plus_with_padding(self): self.assertEqual(special_tokens_mask + [1] * padding_size, right_padded_special_tokens_mask) # Test left padding - tokenizer.padding_side = "left" - left_padded_sequence = tokenizer.encode_plus( - sequence, - max_length=sequence_length + padding_size, - padding="max_length", - return_special_tokens_mask=True, - ) + 
tokenizer_kwargs_left = { + "max_length": sequence_length + padding_size, + "padding": "max_length", + "return_special_tokens_mask": True, + } + + if not use_padding_as_call_kwarg: + tokenizer.padding_side = "left" + else: + tokenizer_kwargs_left["padding_side"] = "left" + + left_padded_sequence = tokenizer.encode_plus(sequence, **tokenizer_kwargs_left) left_padded_input_ids = left_padded_sequence["input_ids"] left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"] left_padded_sequence_length = len(left_padded_input_ids) From 7a5659872a68ce9939c975b5727e5ac61136f256 Mon Sep 17 00:00:00 2001 From: Alvaro Moran <6949769+tengomucho@users.noreply.github.com> Date: Fri, 13 Sep 2024 13:19:06 +0200 Subject: [PATCH 02/67] Mitigate a conflict when using sentencepiece (#33327) * test(tokenizers): add a test showing conflict with sentencepiece This is due to the fact that protobuf C implementation uses a global pool for all added descriptors, so if two different files add descriptors, they will end up conflicting. * fix(tokenizers): mitigate sentencepiece/protobuf conflict When sentencepiece is available, use that protobuf instead of the internal one. 
* chore(style): fix with ruff --- src/transformers/convert_slow_tokenizer.py | 6 +++++- tests/tokenization/test_tokenization_utils.py | 20 ++++++++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index f2064a131dad42..eb75a46a6d9bf2 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -26,7 +26,7 @@ from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors from tokenizers.models import BPE, Unigram, WordPiece -from .utils import is_protobuf_available, logging, requires_backends +from .utils import is_protobuf_available, is_sentencepiece_available, logging, requires_backends from .utils.import_utils import PROTOBUF_IMPORT_ERROR @@ -34,6 +34,10 @@ def import_protobuf(error_message=""): + if is_sentencepiece_available(): + from sentencepiece import sentencepiece_model_pb2 + + return sentencepiece_model_pb2 if is_protobuf_available(): import google.protobuf diff --git a/tests/tokenization/test_tokenization_utils.py b/tests/tokenization/test_tokenization_utils.py index f97ef6a630221d..b43923df84d712 100644 --- a/tests/tokenization/test_tokenization_utils.py +++ b/tests/tokenization/test_tokenization_utils.py @@ -35,7 +35,15 @@ is_tokenizers_available, ) from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer -from transformers.testing_utils import CaptureStderr, require_flax, require_tf, require_tokenizers, require_torch, slow +from transformers.testing_utils import ( + CaptureStderr, + require_flax, + require_sentencepiece, + require_tf, + require_tokenizers, + require_torch, + slow, +) if is_tokenizers_available(): @@ -296,3 +304,13 @@ def test_len_tokenizer(self): self.assertEqual(len(tokenizer), tokenizer.vocab_size + 1) self.assertEqual(len(tokenizer.added_tokens_decoder), added_tokens_size + 1) 
self.assertEqual(len(tokenizer.added_tokens_encoder), added_tokens_size + 1) + + @require_sentencepiece + def test_sentencepiece_cohabitation(self): + from sentencepiece import sentencepiece_model_pb2 as _original_protobuf # noqa: F401 + + from transformers.convert_slow_tokenizer import import_protobuf # noqa: F401 + + # Now this will try to import sentencepiece_model_pb2_new.py. This should not fail even if the protobuf + # was already imported. + import_protobuf() From dfd31158eefab01952e729588a37c9fcc81f0813 Mon Sep 17 00:00:00 2001 From: Amit Garg Date: Fri, 13 Sep 2024 05:07:19 -0700 Subject: [PATCH 03/67] [Phi-3] Bug on stale kv cache (#33129) * fix long seq bug * fixed format * fixed fn copy inconsistency * fix long seq bug * fixed format * fixed fn copy inconsistency * Addressed comments * added a unit test * fixed cache position * Added a warning msg to the forward fn * fixed test case --- src/transformers/models/phi3/modeling_phi3.py | 23 ++++++++++- tests/models/phi3/test_modeling_phi3.py | 41 +++++++++++++++++++ 2 files changed, 62 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index f021c6ce2d339d..273b6a8f505e79 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -257,7 +257,7 @@ def __init__(self, dim, config, device=None): @torch.no_grad() def forward(self, x, position_ids, seq_len=None): - seq_len = torch.max(position_ids) + 1 + seq_len = seq_len or torch.max(position_ids) + 1 if seq_len > self.original_max_position_embeddings: ext_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=x.device) else: @@ -1239,6 +1239,15 @@ def forward( >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] 'This is an example script .\n Certainly! 
Below is a sample script that demonstrates a simple task, such as calculating the sum' ```""" + if ( + use_cache + and self.config.rope_scaling + and cache_position is not None + and cache_position[0] == self.config.original_max_position_embeddings + ): + logger.warning( + f"If you are not using the generate method, you may encounter nonsensical outputs after the {self.config.original_max_position_embeddings}th token, as the KV cache needs to be recomputed." + ) output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -1295,7 +1304,6 @@ def forward( attentions=outputs.attentions, ) - # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.prepare_inputs_for_generation def prepare_inputs_for_generation( self, input_ids, @@ -1308,6 +1316,17 @@ def prepare_inputs_for_generation( num_logits_to_keep=None, **kwargs, ): + # When the first time input length reached long and short factor switching point, enforce re-compute cache + # It will cause downside of slower at this single token position, however, better than current failure. 
+ if ( + past_key_values + and self.config.rope_scaling + and input_ids.shape[1] >= self.config.original_max_position_embeddings + 1 + ): + past_length = cache_position[0] + if past_length <= self.config.original_max_position_embeddings: + past_key_values = None + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens # Exception 1: when passing input_embeds, input_ids may be missing entries # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here diff --git a/tests/models/phi3/test_modeling_phi3.py b/tests/models/phi3/test_modeling_phi3.py index a3f001aba467a0..ce0a71878877b5 100644 --- a/tests/models/phi3/test_modeling_phi3.py +++ b/tests/models/phi3/test_modeling_phi3.py @@ -442,6 +442,47 @@ def test_model_rope_scaling_from_config(self, scaling_type): self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5)) self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) + @parameterized.expand([("longrope",)]) + def test_model_rope_scaling_short_long_factor(self, scaling_type): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + n_factors = config.hidden_size // config.num_key_value_heads // 2 + config.rope_scaling = { + "type": scaling_type, + "short_factor": [3.0 for _ in range(n_factors)], + "long_factor": [5.0 for _ in range(n_factors)], + } + input_tensor = ids_tensor([1, 4090], config.vocab_size) + model = Phi3ForCausalLM(config) + model.to(torch_device) + model.eval() + generation_args_short = { + "max_length": config.original_max_position_embeddings, + "temperature": 0.0, + "use_cache": True, + "do_sample": False, + "return_dict_in_generate": True, + } + output_with_short_factor = model.generate(input_tensor, **generation_args_short) + keys_with_short_factor = output_with_short_factor.past_key_values[0][0] + generation_args_long = { + "max_length": 
config.original_max_position_embeddings + 5, + "temperature": 0.0, + "use_cache": True, + "do_sample": False, + "return_dict_in_generate": True, + "output_logits": True, + } + output_with_long_factor = model.generate(input_tensor, **generation_args_long) + keys_with_long_factor = output_with_long_factor.past_key_values[0][0] + last_token_logits = output_with_long_factor.logits[-1][-1] + regenerated_last_token_logits = model(output_with_long_factor.sequences[:, :-1]).logits[0][-1] + keys_with_long_factor = keys_with_long_factor[:, :, : config.original_max_position_embeddings - 1, :] + + # KV cache is re-computed after reaching the (`config.original_max_position_embeddings`+1)th token position + self.assertFalse(torch.allclose(keys_with_short_factor, keys_with_long_factor, atol=1e-2, rtol=1e-2)) + # Last token generated using long factor + self.assertTrue(torch.allclose(last_token_logits, regenerated_last_token_logits, atol=1e-2, rtol=1e-2)) + @slow @require_torch From 6cc4dfe3f1e8d421c6d6351388e06e9b123cbfe1 Mon Sep 17 00:00:00 2001 From: Marc Sun <57196510+SunMarc@users.noreply.github.com> Date: Fri, 13 Sep 2024 15:06:08 +0200 Subject: [PATCH 04/67] Fix the initialization of the cache when we have multi gpu (#33303) * init cache multi-gpu * Update src/transformers/generation/utils.py Co-authored-by: Joao Gante * switch to execution device map * naming more consistant * fix * mutually exclusive device * added an integration example * remove useless check * suggestion from joao + typing * fix couple of typo and add test * revert check --------- Co-authored-by: Joao Gante --- src/transformers/cache_utils.py | 40 +++++++++---- src/transformers/generation/utils.py | 27 +++++++++ tests/generation/test_utils.py | 85 ++++++++++++++++++++++++++++ 3 files changed, 141 insertions(+), 11 deletions(-) diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py index b3e94da3d7d7bd..0671157e447038 100644 --- a/src/transformers/cache_utils.py +++ 
b/src/transformers/cache_utils.py @@ -1030,6 +1030,9 @@ class StaticCache(Cache): The device on which the cache should be initialized. Should be the same as the layer. dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): The default `dtype` to use when initializing the layer. + layer_device_map(`Dict[int, Union[str, torch.device, int]]]`, `optional`): + Mapping between the layers and its device. This is required when you are manually initializing the cache and the model is splitted between differents gpus. + You can know which layers mapped to which device by checking the associated device_map: `model.hf_device_map`. Example: @@ -1060,6 +1063,7 @@ def __init__( device: torch.device = None, dtype: torch.dtype = torch.float32, max_batch_size: Optional[int] = None, + layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None, ) -> None: super().__init__() if max_batch_size is not None: @@ -1088,16 +1092,20 @@ def __init__( # Note: There will be significant perf decrease if switching to use 5D tensors instead. cache_shape = (self.batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim) for idx in range(config.num_hidden_layers): - new_layer_key_cache = torch.zeros(cache_shape, dtype=self.dtype, device=device) - new_layer_value_cache = torch.zeros(cache_shape, dtype=self.dtype, device=device) + if layer_device_map is not None: + layer_device = layer_device_map[idx] + else: + layer_device = device + new_layer_key_cache = torch.zeros(cache_shape, dtype=self.dtype, device=layer_device) + new_layer_value_cache = torch.zeros(cache_shape, dtype=self.dtype, device=layer_device) # Notes: # 1. `mark_static_address` is used to tag the cache as an fixed data pointer, preventing cuda graph # breaks when updating the cache. It can't be used if the cache code is being compiled (but in that case # it is not needed anyway) # 2. `torch.export()` requires mutations to be registered as buffers. 
if not is_torchdynamo_compiling(): - self.register_buffer(f"key_cache_{idx}", torch.zeros(cache_shape, dtype=dtype, device=device)) - self.register_buffer(f"value_cache_{idx}", torch.zeros(cache_shape, dtype=dtype, device=device)) + self.register_buffer(f"key_cache_{idx}", torch.zeros(cache_shape, dtype=dtype, device=layer_device)) + self.register_buffer(f"value_cache_{idx}", torch.zeros(cache_shape, dtype=dtype, device=layer_device)) new_layer_key_cache = getattr(self, f"key_cache_{idx}") new_layer_value_cache = getattr(self, f"value_cache_{idx}") torch._dynamo.mark_static_address(new_layer_key_cache) @@ -1130,9 +1138,9 @@ def update( Return: A tuple containing the updated key and value states. """ + cache_position = cache_kwargs.get("cache_position") - self.key_cache[layer_idx] = self.key_cache[layer_idx].to(device=key_states.device) - self.value_cache[layer_idx] = self.value_cache[layer_idx].to(device=value_states.device) + k_out = self.key_cache[layer_idx] v_out = self.value_cache[layer_idx] @@ -1201,6 +1209,9 @@ class SlidingWindowCache(StaticCache): The device on which the cache should be initialized. Should be the same as the layer. dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): The default `dtype` to use when initializing the layer. + layer_device_map(`Dict[int, Union[str, torch.device, int]]]`, `optional`): + Mapping between the layers and its device. This is required when you are manually initializing the cache and the model is splitted between differents gpus. + You can know which layers mapped to which device by checking the associated device_map: `model.hf_device_map`. 
Example: @@ -1231,6 +1242,7 @@ def __init__( device: torch.device = None, dtype: torch.dtype = torch.float32, max_batch_size: Optional[int] = None, + layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None, ) -> None: super().__init__() if not hasattr(config, "sliding_window") or config.sliding_window is None: @@ -1247,6 +1259,7 @@ def __init__( device=device, dtype=dtype, max_batch_size=max_batch_size, + layer_device_map=layer_device_map, ) def update( @@ -1280,7 +1293,6 @@ def update( v_out = v_out[:, :, indices] try: - cache_position.to(device=k_out.device) k_out.index_copy_(2, cache_position, key_states) v_out.index_copy_(2, cache_position, value_states) except NotImplementedError: @@ -1495,6 +1507,9 @@ class HybridCache(Cache): The device on which the cache should be initialized. Should be the same as the layer. dtype (torch.dtype, *optional*, defaults to `torch.float32`): The default `dtype` to use when initializing the layer. + layer_device_map(`Dict[int, Union[str, torch.device, int]]]`, `optional`): + Mapping between the layers and its device. This is required when you are manually initializing the cache and the model is splitted between differents gpus. + You can know which layers mapped to which device by checking the associated device_map: `model.hf_device_map`. Example: @@ -1525,6 +1540,7 @@ def __init__( device: Union[torch.device, str] = "cpu", dtype: torch.dtype = torch.float32, max_batch_size: Optional[int] = None, + layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None, ) -> None: super().__init__() if max_batch_size is not None: @@ -1562,11 +1578,15 @@ def __init__( self.head_dim, ) for i in range(config.num_hidden_layers): + if layer_device_map is not None: + layer_device = layer_device_map[i] + else: + layer_device = device # Note: `mark_static_address` is used to tag the cache as an fixed data pointer, preventing cuda graph # breaks when updating the cache. 
cache_shape = global_cache_shape if not self.is_sliding[i] else sliding_cache_shape - new_layer_key_cache = torch.zeros(cache_shape, dtype=self.dtype, device=device) - new_layer_value_cache = torch.zeros(cache_shape, dtype=self.dtype, device=device) + new_layer_key_cache = torch.zeros(cache_shape, dtype=self.dtype, device=layer_device) + new_layer_value_cache = torch.zeros(cache_shape, dtype=self.dtype, device=layer_device) torch._dynamo.mark_static_address(new_layer_key_cache) torch._dynamo.mark_static_address(new_layer_value_cache) self.key_cache.append(new_layer_key_cache) @@ -1617,8 +1637,6 @@ def update( ) -> Tuple[torch.Tensor]: cache_position = cache_kwargs.get("cache_position") sliding_window = cache_kwargs.get("sliding_window") - self.key_cache[layer_idx] = self.key_cache[layer_idx].to(device=key_states.device) - self.value_cache[layer_idx] = self.value_cache[layer_idx].to(device=value_states.device) k_out = self.key_cache[layer_idx] v_out = self.value_cache[layer_idx] if sliding_window: diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 17a234c62b285e..019eb6c27f18cc 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -1446,12 +1446,39 @@ def _get_cache( # models. May cause trobles with non-text modalities. cache_dtype = self.get_output_embeddings().weight.dtype + def get_layer_device_map(execution_device_map: Optional[dict] = None): + if execution_device_map is None or len(execution_device_map) <= 1: + return None + layer_device_map = {} + for layer in execution_device_map: + for idx in range(self.config.num_hidden_layers): + if f".{idx}." 
in f"{layer}.": + layer_device_map[idx] = execution_device_map[layer] + break + for idx in range(self.config.num_hidden_layers): + if idx not in layer_device_map: + raise RuntimeError(f"layer {idx} has not been mapped to a device.") + return layer_device_map + + execution_device_map = None + # Taken from dispatch_model from accelerate. + # This is needed here if we don't want to make changes in accelerate in order to save execution_device + # For offloaded case, we need to get the execution device, not just the device where it is offloaded + if hasattr(self, "hf_device_map"): + main_device = [d for d in self.hf_device_map.values() if d not in ["cpu", "disk"]][0] + execution_device_map = { + name: main_device if device in ["cpu", "disk"] else device + for name, device in self.hf_device_map.items() + } + layer_device_map = get_layer_device_map(execution_device_map) + cache_kwargs = { "config": self.config if hasattr(self.config, "text_config") else self.config, "max_batch_size": batch_size, "max_cache_len": max_cache_len, "device": device, "dtype": cache_dtype, + "layer_device_map": layer_device_map, } self._cache = cache_cls(**cache_kwargs) if requires_cross_attention_cache: diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 65507795c84dd8..0ed054ad58696e 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -3444,6 +3444,91 @@ def test_special_tokens_fall_back_to_model_default(self): self.assertTrue(test_bos_id == gen_output[0, 0]) self.assertTrue(generation_config.bos_token_id is None) + @pytest.mark.generate + @require_torch_multi_gpu + def test_generate_with_static_cache_multi_gpu(self): + """ + Tests if the static cache has been set correctly and if generate works correctly when we are using multi-gpus. 
+ """ + # need to split manually as auto doesn't work well with unbalanced model + device_map = {"model.embed_tokens": 0, "model.layers.0": 0, "model.layers.1": 1, "model.norm": 1, "lm_head": 0} + model = AutoModelForCausalLM.from_pretrained( + "hf-internal-testing/tiny-random-MistralForCausalLM", device_map=device_map + ) + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM") + + text = "Hello world" + tokenized_inputs = tokenizer([text], return_tensors="pt") + input_ids = tokenized_inputs.input_ids.to(torch_device) + + generation_kwargs = { + "max_new_tokens": 20, + "cache_implementation": "static", + "return_dict_in_generate": True, # Required to return `past_key_values` + } + + results = model.generate(input_ids, **generation_kwargs) + self.assertTrue(isinstance(results.past_key_values, StaticCache)) + + # check device of each layer + key_cache_0 = results.past_key_values.key_cache[0] + value_cache_0 = results.past_key_values.value_cache[0] + self.assertTrue(key_cache_0.device == value_cache_0.device == torch.device(0)) + + key_cache_1 = results.past_key_values.key_cache[1] + value_cache_1 = results.past_key_values.value_cache[1] + self.assertTrue(key_cache_1.device == value_cache_1.device == torch.device(1)) + + @pytest.mark.generate + @require_torch_multi_gpu + def test_init_static_cache_multi_gpu(self): + """ + Tests if the static cache has been set correctly when we initialize it manually in a multi-gpu setup. 
+ """ + # need to split manually as auto doesn't work well with unbalanced model + device_map = {"model.embed_tokens": 0, "model.layers.0": 0, "model.layers.1": 1, "model.norm": 1, "lm_head": 0} + model = AutoModelForCausalLM.from_pretrained( + "hf-internal-testing/tiny-random-MistralForCausalLM", device_map=device_map + ) + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM") + + text = "Hello world" + tokenized_inputs = tokenizer([text], return_tensors="pt") + input_ids = tokenized_inputs.input_ids.to(torch_device) + + generation_kwargs = { + "max_new_tokens": 20, + "return_dict_in_generate": True, # Required to return `past_key_values` + } + + # TODO: We need to raise a warning in case the cache is not set correctly + # with self.assertRaisesRegex(ValueError, "If you are manually initializing the cache"): + # past_key_values = StaticCache( + # config=model.config, batch_size=1, max_cache_len=30, device=torch_device, dtype=model.dtype + # ) + # results = model.generate(input_ids, past_key_values=past_key_values, **generation_kwargs) + + # deduced from the device_map : layer 0 on device 0 and layer 1 on device 1 + layer_device_map = {0: 0, 1: 1} + past_key_values = StaticCache( + config=model.config, + batch_size=1, + max_cache_len=30, + device=torch_device, + dtype=model.dtype, + layer_device_map=layer_device_map, + ) + results = model.generate(input_ids, past_key_values=past_key_values, **generation_kwargs) + + # check device of each layer + key_cache_0 = results.past_key_values.key_cache[0] + value_cache_0 = results.past_key_values.value_cache[0] + self.assertTrue(key_cache_0.device == value_cache_0.device == torch.device(0)) + + key_cache_1 = results.past_key_values.key_cache[1] + value_cache_1 = results.past_key_values.value_cache[1] + self.assertTrue(key_cache_1.device == value_cache_1.device == torch.device(1)) + @require_torch class TokenHealingTestCase(unittest.TestCase): From 
0963229e287501bed52ae1dabc17922524de6992 Mon Sep 17 00:00:00 2001 From: Marc Sun <57196510+SunMarc@users.noreply.github.com> Date: Fri, 13 Sep 2024 15:07:12 +0200 Subject: [PATCH 05/67] Enable finetuning with torchao quantized model (#33361) enable training --- src/transformers/quantizers/quantizer_torchao.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/transformers/quantizers/quantizer_torchao.py b/src/transformers/quantizers/quantizer_torchao.py index 3b5dfff2090499..02ea8294a2d54a 100644 --- a/src/transformers/quantizers/quantizer_torchao.py +++ b/src/transformers/quantizers/quantizer_torchao.py @@ -166,7 +166,8 @@ def is_serializable(self): @property def is_trainable(self): - # torchao does not have official support for QAT (Quantization Aware Training) - # but torchao support nf4/PEFT, but it is not integrated yet - # TODO: if this is supported in the future, do a version check here. - return False + supported_quant_types_for_training = [ + "int8_weight_only", + "int8_dynamic_activation_int8_weight", + ] + return self.quantization_config.quant_type in supported_quant_types_for_training From e39b6c1c7cdc890b6849b8c9de545fc9590ba871 Mon Sep 17 00:00:00 2001 From: Sergio Paniego Blanco Date: Fri, 13 Sep 2024 17:15:20 +0200 Subject: [PATCH 06/67] Corrected `Agents and tools` documentation links typos (#33471) * Corrected agents task link typo * Corrected chat templating link * Corrected chat templating link 2 --- docs/source/en/agents.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/en/agents.md b/docs/source/en/agents.md index b100e39f1c9591..0b889f4eec867b 100644 --- a/docs/source/en/agents.md +++ b/docs/source/en/agents.md @@ -19,7 +19,7 @@ rendered properly in your Markdown viewer. ### What is an agent? -Large Language Models (LLMs) trained to perform [causal language modeling](./tasks/language_modeling.) 
can tackle a wide range of tasks, but they often struggle with basic tasks like logic, calculation, and search. When prompted in domains in which they do not perform well, they often fail to generate the answer we expect them to. +Large Language Models (LLMs) trained to perform [causal language modeling](./tasks/language_modeling) can tackle a wide range of tasks, but they often struggle with basic tasks like logic, calculation, and search. When prompted in domains in which they do not perform well, they often fail to generate the answer we expect them to. One approach to overcome this weakness is to create an *agent*. @@ -114,7 +114,7 @@ To start with, please install the `agents` extras in order to install all defaul pip install transformers[agents] ``` -Build your LLM engine by defining a `llm_engine` method which accepts a list of [messages](./chat_templating.) and returns text. This callable also needs to accept a `stop` argument that indicates when to stop generating. +Build your LLM engine by defining a `llm_engine` method which accepts a list of [messages](./chat_templating) and returns text. This callable also needs to accept a `stop` argument that indicates when to stop generating. ```python from huggingface_hub import login, InferenceClient @@ -130,7 +130,7 @@ def llm_engine(messages, stop_sequences=["Task"]) -> str: ``` You could use any `llm_engine` method as long as: -1. it follows the [messages format](./chat_templating.md) (`List[Dict[str, str]]`) for its input `messages`, and it returns a `str`. +1. it follows the [messages format](./chat_templating) (`List[Dict[str, str]]`) for its input `messages`, and it returns a `str`. 2. it stops generating outputs at the sequences passed in the argument `stop_sequences` Additionally, `llm_engine` can also take a `grammar` argument. 
In the case where you specify a `grammar` upon agent initialization, this argument will be passed to the calls to llm_engine, with the `grammar` that you defined upon initialization, to allow [constrained generation](https://huggingface.co/docs/text-generation-inference/conceptual/guidance) in order to force properly-formatted agent outputs. From 7bb1c99800d235791dace10305731f377db8077b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Lemayian=20=E2=9C=A8?= <877919+DavidLemayian@users.noreply.github.com> Date: Sat, 14 Sep 2024 00:25:20 +0300 Subject: [PATCH 07/67] chore: fix typo in comment in tokenization_utils_base.py (#33466) docs: update grammar in comment in tokenization_utils_base.py small grammar update in tokenization_utils_base.py comment --- src/transformers/tokenization_utils_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 93dea5ba09de36..b4490578a70916 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -3457,7 +3457,7 @@ def pad( if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], Mapping): encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()} - # The model's main input name, usually `input_ids`, has be passed for padding + # The model's main input name, usually `input_ids`, has been passed for padding if self.model_input_names[0] not in encoded_inputs: raise ValueError( "You should supply an encoding or a list of encodings to this method " From 8bd2b1e8c23234cd607ca8d63f53c1edfea27462 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Sat, 14 Sep 2024 12:28:39 +0200 Subject: [PATCH 08/67] Add support for Pixtral (#33449) * initial commit * gloups * updates * work * weights match * nits * nits * updates to support the tokenizer :) * updates * Pixtral 
processor (#33454) * rough outline * Add in image break and end tokens * Fix * Udo some formatting changes * Set patch_size default * Fix * Fix token expansion * nit in conversion script * Fix image token list creation * done * add expected results * Process list of list of images (#33465) * updates * working image and processor * this is the expected format * some fixes * push current updated * working mult images! * add a small integration test * Uodate configuration docstring * Formatting * Config docstring fix * simplify model test * fixup modeling and etests * Return BatchMixFeature in image processor * fix some copies * update * nits * Update model docstring * Apply suggestions from code review * Fix up * updates * revert modeling changes * update * update * fix load safe * addd liscence * update * use pixel_values as required by the model * skip some tests and refactor * Add pixtral image processing tests (#33476) * Image processing tests * Add processing tests * woops * defaults reflect pixtral image processor * fixup post merge * images -> pixel values * oups sorry Mr docbuilder * isort * fix * fix processor tests * small fixes * nit * update * last nits * oups this was really breaking! 
* nits * is composition needs to be true --------- Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- docs/source/en/_toctree.yml | 2 + docs/source/en/index.md | 1 + docs/source/en/model_doc/pixtral.md | 98 ++++ src/transformers/__init__.py | 13 +- src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 2 + .../models/auto/image_processing_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 1 + .../models/auto/processing_auto.py | 1 + .../models/auto/tokenization_auto.py | 1 + .../models/llava/configuration_llava.py | 2 +- src/transformers/models/pixtral/__init__.py | 70 +++ .../models/pixtral/configuration_pixtral.py | 103 ++++ .../pixtral/convert_pixtral_weights_to_hf.py | 285 ++++++++++ .../pixtral/image_processing_pixtral.py | 519 ++++++++++++++++++ .../models/pixtral/modeling_pixtral.py | 517 +++++++++++++++++ .../models/pixtral/processing_pixtral.py | 282 ++++++++++ src/transformers/utils/dummy_pt_objects.py | 14 + .../utils/dummy_vision_objects.py | 7 + tests/models/llava/test_modeling_llava.py | 47 ++ tests/models/pixtral/__init__.py | 0 .../pixtral/test_image_processing_pixtral.py | 217 ++++++++ tests/models/pixtral/test_modeling_pixtral.py | 292 ++++++++++ .../models/pixtral/test_processor_pixtral.py | 233 ++++++++ 24 files changed, 2707 insertions(+), 2 deletions(-) create mode 100644 docs/source/en/model_doc/pixtral.md create mode 100644 src/transformers/models/pixtral/__init__.py create mode 100644 src/transformers/models/pixtral/configuration_pixtral.py create mode 100644 src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py create mode 100644 src/transformers/models/pixtral/image_processing_pixtral.py create mode 100644 src/transformers/models/pixtral/modeling_pixtral.py create mode 100644 src/transformers/models/pixtral/processing_pixtral.py create mode 100644 tests/models/pixtral/__init__.py create mode 100644 tests/models/pixtral/test_image_processing_pixtral.py create 
mode 100644 tests/models/pixtral/test_modeling_pixtral.py create mode 100644 tests/models/pixtral/test_processor_pixtral.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 1c7f62ec6ea7b8..235ea81a7f1ea6 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -862,6 +862,8 @@ title: Perceiver - local: model_doc/pix2struct title: Pix2Struct + - local: model_doc/pixtral + title: Pixtral - local: model_doc/sam title: Segment Anything - local: model_doc/siglip diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 8e3a4da8b021de..c18426de4c031c 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -253,6 +253,7 @@ Flax), PyTorch, and/or TensorFlow. | [Phi3](model_doc/phi3) | ✅ | ❌ | ❌ | | [PhoBERT](model_doc/phobert) | ✅ | ✅ | ✅ | | [Pix2Struct](model_doc/pix2struct) | ✅ | ❌ | ❌ | +| [Pixtral](model_doc/pixtral) | ❌ | ❌ | ❌ | | [PLBart](model_doc/plbart) | ✅ | ❌ | ❌ | | [PoolFormer](model_doc/poolformer) | ✅ | ❌ | ❌ | | [Pop2Piano](model_doc/pop2piano) | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/pixtral.md b/docs/source/en/model_doc/pixtral.md new file mode 100644 index 00000000000000..8df2bf5af5f9ca --- /dev/null +++ b/docs/source/en/model_doc/pixtral.md @@ -0,0 +1,98 @@ + + +# Pixtral + +## Overview + +The Pixtral model was released by the Mistral AI team on [Vllm](https://github.com/vllm-project/vllm/pull/8377), where a version of the code can be found! + + +Tips: + +- Pixtral is a multimodal model, the main contribution is the 2d ROPE on the images, and support for arbitrary image size (the images are not padded together nor are they resized) +- This model follows the `Llava` family, meaning image embeddings are placed instead of the `[IMG]` token placeholders.
+- The format for one or multiple prompts is the following: +``` +"[INST][IMG]\nWhat are the things I should be cautious about when I visit this place?[/INST]" +``` +Then, the processor will replace each `[IMG]` token with a number of `[IMG]` tokens that depends on the height and the width of the image. Each *row* of the image is separated by a `[IMG_BREAK]` token, and each image is separated by a `[IMG_END]` token. + +This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts) and [ArthurZ](https://huggingface.co/ArthurZ) + +Here is an example of how to run it: + +```python +from transformers import LlavaForConditionalGeneration, AutoProcessor +from PIL import Image + +model_id = "hf-internal-testing/pixtral-12b" +model = LlavaForConditionalGeneration.from_pretrained(model_id).to("cuda") +processor = AutoProcessor.from_pretrained(model_id) + +IMG_URLS = [ + "https://picsum.photos/id/237/400/300", + "https://picsum.photos/id/231/200/300", + "https://picsum.photos/id/27/500/500", + "https://picsum.photos/id/17/150/600", +] +PROMPT = "[INST]Describe the images.\n[IMG][IMG][IMG][IMG][/INST]" + +inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to("cuda") +generate_ids = model.generate(**inputs, max_new_tokens=500) +output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + +EXPECTED_GENERATION = """ +Describe the images. +Sure, let's break down each image description: + +1. **Image 1:** + - **Description:** A black dog with a glossy coat is sitting on a wooden floor. The dog has a focused expression and is looking directly at the camera. + - **Details:** The wooden floor has a rustic appearance with visible wood grain patterns. The dog's eyes are a striking color, possibly brown or amber, which contrasts with its black fur. + +2. **Image 2:** + - **Description:** A scenic view of a mountainous landscape with a winding road cutting through it.
The road is surrounded by lush green vegetation and leads to a distant valley. + - **Details:** The mountains are rugged with steep slopes, and the sky is clear, indicating good weather. The winding road adds a sense of depth and perspective to the image. + +3. **Image 3:** + - **Description:** A beach scene with waves crashing against the shore. There are several people in the water and on the beach, enjoying the waves and the sunset. + - **Details:** The waves are powerful, creating a dynamic and lively atmosphere. The sky is painted with hues of orange and pink from the setting sun, adding a warm glow to the scene. + +4. **Image 4:** + - **Description:** A garden path leading to a large tree with a bench underneath it. The path is bordered by well-maintained grass and flowers. + - **Details:** The path is made of small stones or gravel, and the tree provides a shaded area with the bench invitingly placed beneath it. The surrounding area is lush and green, suggesting a well-kept garden. + +Each image captures a different scene, from a close-up of a dog to expansive natural landscapes, showcasing various elements of nature and human interaction with it. 
+""" + +``` +## PixtralVisionConfig + +[[autodoc]] PixtralVisionConfig + +## PixtralModel + +[[autodoc]] PixtralModel + - forward + +## PixtralImageProcessor + +[[autodoc]] PixtralImageProcessor + - preprocess + +## PixtralProcessor + +[[autodoc]] PixtralProcessor diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 00cc67915f3664..36775d8454ab8c 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -649,6 +649,7 @@ "Pix2StructTextConfig", "Pix2StructVisionConfig", ], + "models.pixtral": ["PixtralProcessor", "PixtralVisionConfig"], "models.plbart": ["PLBartConfig"], "models.poolformer": ["PoolFormerConfig"], "models.pop2piano": ["Pop2PianoConfig"], @@ -1199,6 +1200,7 @@ _import_structure["models.owlvit"].extend(["OwlViTFeatureExtractor", "OwlViTImageProcessor"]) _import_structure["models.perceiver"].extend(["PerceiverFeatureExtractor", "PerceiverImageProcessor"]) _import_structure["models.pix2struct"].extend(["Pix2StructImageProcessor"]) + _import_structure["models.pixtral"].append("PixtralImageProcessor") _import_structure["models.poolformer"].extend(["PoolFormerFeatureExtractor", "PoolFormerImageProcessor"]) _import_structure["models.pvt"].extend(["PvtImageProcessor"]) _import_structure["models.qwen2_vl"].extend(["Qwen2VLImageProcessor"]) @@ -1359,7 +1361,6 @@ "AlignVisionModel", ] ) - _import_structure["models.altclip"].extend( [ "AltCLIPModel", @@ -2977,6 +2978,7 @@ "Pix2StructVisionModel", ] ) + _import_structure["models.pixtral"].extend(["PixtralModel", "PixtralPreTrainedModel"]) _import_structure["models.plbart"].extend( [ "PLBartForCausalLM", @@ -5434,6 +5436,10 @@ Pix2StructTextConfig, Pix2StructVisionConfig, ) + from .models.pixtral import ( + PixtralProcessor, + PixtralVisionConfig, + ) from .models.plbart import PLBartConfig from .models.poolformer import ( PoolFormerConfig, @@ -6009,6 +6015,7 @@ from .models.owlvit import OwlViTFeatureExtractor, OwlViTImageProcessor from .models.perceiver import 
PerceiverFeatureExtractor, PerceiverImageProcessor from .models.pix2struct import Pix2StructImageProcessor + from .models.pixtral import PixtralImageProcessor from .models.poolformer import ( PoolFormerFeatureExtractor, PoolFormerImageProcessor, @@ -7448,6 +7455,10 @@ Pix2StructTextModel, Pix2StructVisionModel, ) + from .models.pixtral import ( + PixtralModel, + PixtralPreTrainedModel, + ) from .models.plbart import ( PLBartForCausalLM, PLBartForConditionalGeneration, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 26b96def67d992..2022048cd4553f 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -187,6 +187,7 @@ phi3, phobert, pix2struct, + pixtral, plbart, poolformer, pop2piano, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index fa1a7fb88eafa8..2cd7d550d90b7a 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -205,6 +205,7 @@ ("phi", "PhiConfig"), ("phi3", "Phi3Config"), ("pix2struct", "Pix2StructConfig"), + ("pixtral", "PixtralVisionConfig"), ("plbart", "PLBartConfig"), ("poolformer", "PoolFormerConfig"), ("pop2piano", "Pop2PianoConfig"), @@ -509,6 +510,7 @@ ("phi3", "Phi3"), ("phobert", "PhoBERT"), ("pix2struct", "Pix2Struct"), + ("pixtral", "Pixtral"), ("plbart", "PLBart"), ("poolformer", "PoolFormer"), ("pop2piano", "Pop2Piano"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index c83c43518a6a31..95d9ddef8f7979 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -114,6 +114,7 @@ ("owlvit", ("OwlViTImageProcessor",)), ("perceiver", ("PerceiverImageProcessor",)), ("pix2struct", ("Pix2StructImageProcessor",)), + ("pixtral", ("PixtralImageProcessor",)), ("poolformer", 
("PoolFormerImageProcessor",)), ("pvt", ("PvtImageProcessor",)), ("pvt_v2", ("PvtImageProcessor",)), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 45a9c4d0d078b7..e0d15f1e236590 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -193,6 +193,7 @@ ("persimmon", "PersimmonModel"), ("phi", "PhiModel"), ("phi3", "Phi3Model"), + ("pixtral", "PixtralModel"), ("plbart", "PLBartModel"), ("poolformer", "PoolFormerModel"), ("prophetnet", "ProphetNetModel"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 7f49e0e8d99730..82d325248eabfb 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -82,6 +82,7 @@ ("owlvit", "OwlViTProcessor"), ("paligemma", "PaliGemmaProcessor"), ("pix2struct", "Pix2StructProcessor"), + ("pixtral", "PixtralProcessor"), ("pop2piano", "Pop2PianoProcessor"), ("qwen2_audio", "Qwen2AudioProcessor"), ("qwen2_vl", "Qwen2VLProcessor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index c8eb06db04a098..e735579108d857 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -385,6 +385,7 @@ ("phi3", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("phobert", ("PhobertTokenizer", None)), ("pix2struct", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)), + ("pixtral", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), ("plbart", ("PLBartTokenizer" if is_sentencepiece_available() else None, None)), ("prophetnet", ("ProphetNetTokenizer", None)), ("qdqbert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), diff --git 
a/src/transformers/models/llava/configuration_llava.py b/src/transformers/models/llava/configuration_llava.py index f2338a7c5a5df7..3a4cb09855f0ec 100644 --- a/src/transformers/models/llava/configuration_llava.py +++ b/src/transformers/models/llava/configuration_llava.py @@ -73,7 +73,7 @@ class LlavaConfig(PretrainedConfig): ```""" model_type = "llava" - is_composition = False + is_composition = True def __init__( self, diff --git a/src/transformers/models/pixtral/__init__.py b/src/transformers/models/pixtral/__init__.py new file mode 100644 index 00000000000000..e09ed8e60127dd --- /dev/null +++ b/src/transformers/models/pixtral/__init__.py @@ -0,0 +1,70 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING
+
+from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
+
+
+# Maps each submodule to the public names it exports; `_LazyModule` imports a submodule on first attribute access.
+_import_structure = {
+    "configuration_pixtral": ["PixtralVisionConfig"],
+    "processing_pixtral": ["PixtralProcessor"],
+}
+
+
+# Modeling classes are only registered when torch is installed.
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_pixtral"] = [
+        "PixtralModel",
+        "PixtralPreTrainedModel",
+    ]
+
+# The image processor is only registered when the vision dependencies are installed.
+try:
+    if not is_vision_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["image_processing_pixtral"] = ["PixtralImageProcessor"]
+
+
+if TYPE_CHECKING:
+    # Mirror `_import_structure` exactly: `PixtralProcessor` is exported by `processing_pixtral`,
+    # not by `configuration_pixtral`.
+    from .configuration_pixtral import PixtralVisionConfig
+    from .processing_pixtral import PixtralProcessor
+
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_pixtral import (
+            PixtralModel,
+            PixtralPreTrainedModel,
+        )
+
+    try:
+        if not is_vision_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .image_processing_pixtral import PixtralImageProcessor
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
diff --git a/src/transformers/models/pixtral/configuration_pixtral.py b/src/transformers/models/pixtral/configuration_pixtral.py
new file mode 100644
index 00000000000000..dcc1e458ca78a3
--- /dev/null
+++ b/src/transformers/models/pixtral/configuration_pixtral.py
@@ -0,0 +1,103 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Pixtral model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class PixtralVisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`PixtralModel`]. It is used to instantiate a
+    Pixtral model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Pixtral-9B.
+
+    e.g. [pixtral-hf/pixtral-9b](https://huggingface.co/pixtral-hf/pixtral-9b)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 4096):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads in the Transformer encoder.
+        num_channels (`int`, *optional*, defaults to 3):
+            Number of input channels in the input images.
+        image_size (`int`, *optional*, defaults to 1024):
+            Max dimension of the input images.
+        patch_size (`int`, *optional*, defaults to 16):
+            Size of the image patches.
+        hidden_act (`str`, *optional*, defaults to `"gelu"`):
+            Activation function used in the hidden layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            Dropout probability for the attention layers.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie the word embeddings with the input embeddings.
+
+    Example:
+
+    ```python
+    >>> from transformers import PixtralModel, PixtralVisionConfig
+
+    >>> # Initializing a Pixtral 12B style configuration
+    >>> configuration = PixtralVisionConfig()
+
+    >>> # Initializing a model from the pixtral 12B style configuration
+    >>> model = PixtralModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "pixtral"
+
+    def __init__(
+        self,
+        hidden_size=1024,
+        intermediate_size=4096,
+        num_hidden_layers=24,
+        num_attention_heads=16,
+        num_channels=3,
+        image_size=1024,
+        patch_size=16,
+        hidden_act="gelu",
+        attention_dropout=0.0,
+        rope_theta=10000.0,
+        tie_word_embeddings=False,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.attention_dropout = attention_dropout
+        self.hidden_act = hidden_act
+        self.rope_theta = rope_theta
+        self.tie_word_embeddings = tie_word_embeddings
+        # Per-head width, derived with integer division; divisibility of `hidden_size` by
+        # `num_attention_heads` is not checked here.
+        self.head_dim = hidden_size // num_attention_heads
diff --git a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py
new file mode 100644
index 00000000000000..c4190082d99471
--- /dev/null
+++ b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py
@@ -0,0 +1,285 @@
+#
coding=utf-8 +# Copyright 2024 HuggingFace Inc. team. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse + +import regex as re +import torch +from mistral_common.tokens.tokenizers.mistral import MistralTokenizer +from safetensors.torch import load_file as safe_load_file +from tokenizers import Regex, Tokenizer, decoders, pre_tokenizers, processors +from tokenizers.models import BPE + +from transformers import ( + LlavaConfig, + LlavaForConditionalGeneration, + MistralConfig, + PixtralImageProcessor, + PixtralProcessor, + PixtralVisionConfig, + PreTrainedTokenizerFast, +) +from transformers.convert_slow_tokenizer import bytes_to_unicode + + +""" +# Here is how to get the original tokens! 
+model_name = "mistralai/Pixtral-12B-2409" +tok = MistralTokenizer.from_model(model_name) + +from mistral_common.protocol.instruct.request import ChatCompletionRequest, UserMessage, ImageChunk, TextChunk + +EXPECTED_TOKENS = tok.encode_chat_completion( + ChatCompletionRequest( + messages=[ + UserMessage( + content=[ + TextChunk(text="Describe the images"), + ] + [ImageChunk(image=img) for img in IMG_URLS] + ) + ], + model="pixtral", + ) +) +assert tokenizer.decode(inputs["input_ids"][0]) == EXPECTED_TOKENS +""" + +OLD_KEY_TO_NEW_KEY_MAPPING = { + # Layer Normalization Weights + r"vision_encoder.transformer.layers.(\d+).input_layernorm.weight": r"vision_tower.transformer.layers.\1.attention_norm.weight", + r"vision_encoder.transformer.layers.(\d+).ffn_norm.weight": r"vision_tower.transformer.layers.\1.ffn_norm.weight", + # Self Attention Projections + r"vision_encoder.transformer.layers.(\d+).attention.wq.weight": r"vision_tower.transformer.layers.\1.attention.q_proj.weight", + r"vision_encoder.transformer.layers.(\d+).attention.wk.weight": r"vision_tower.transformer.layers.\1.attention.k_proj.weight", + r"vision_encoder.transformer.layers.(\d+).attention.wv.weight": r"vision_tower.transformer.layers.\1.attention.v_proj.weight", + r"vision_encoder.transformer.layers.(\d+).attention.wo.weight": r"vision_tower.transformer.layers.\1.attention.o_proj.weight", + # MLP Projections + r"vision_encoder.transformer.layers.(\d+).feed_forward.w1.weight": r"vision_tower.transformer.layers.\1.feed_forward.gate_proj.weight", + r"vision_encoder.transformer.layers.(\d+).feed_forward.w2.weight": r"vision_tower.transformer.layers.\1.feed_forward.down_proj.weight", + r"vision_encoder.transformer.layers.(\d+).feed_forward.w3.weight": r"vision_tower.transformer.layers.\1.feed_forward.up_proj.weight", + # Additional mappings + r"vision_encoder": r"vision_tower", + r"vision_language_adapter.w_in": r"multi_modal_projector.linear_1", + r"vision_language_adapter.w_out": 
r"multi_modal_projector.linear_2", + r"layers.(\d+).attention.wq.weight": r"language_model.model.layers.\1.self_attn.q_proj.weight", + r"layers.(\d+).attention.wk.weight": r"language_model.model.layers.\1.self_attn.k_proj.weight", + r"layers.(\d+).attention.wv.weight": r"language_model.model.layers.\1.self_attn.v_proj.weight", + r"layers.(\d+).attention.wo.weight": r"language_model.model.layers.\1.self_attn.o_proj.weight", + r"layers.(\d+).feed_forward.w1.weight": r"language_model.model.layers.\1.mlp.gate_proj.weight", + r"layers.(\d+).feed_forward.w2.weight": r"language_model.model.layers.\1.mlp.down_proj.weight", + r"layers.(\d+).feed_forward.w3.weight": r"language_model.model.layers.\1.mlp.up_proj.weight", + r"layers.(\d+).ffn_norm.weight": r"language_model.model.layers.\1.post_attention_layernorm.weight", + r"layers.(\d+).attention_norm.weight": r"language_model.model.layers.\1.input_layernorm.weight", + r"tok_embeddings.weight": r"language_model.model.embed_tokens.weight", + r"output.weight": r"language_model.lm_head.weight", + r"norm.weight": r"language_model.model.norm.weight", +} + + +class MistralConverter: + """ + A general tiktoken converter. 
+    """
+
+    def __init__(
+        self,
+        vocab=None,
+        pattern=r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""",
+        add_prefix_space=False,
+        additional_special_tokens=None,
+        *args,
+        **kwargs,
+    ):
+        """
+        Store the conversion inputs.
+
+        Args:
+            vocab (`dict`, *optional*):
+                Mapping from token to rank, as consumed by `extract_vocab_merges_from_model`.
+            pattern (`str`, *optional*):
+                Regular expression used by the `Split` pre-tokenizer.
+            add_prefix_space (`bool`, *optional*, defaults to `False`):
+                Forwarded to the `ByteLevel` pre-tokenizer.
+            additional_special_tokens (`list`, *optional*):
+                Tokens stored in the vocabulary as-is and registered as special tokens.
+        """
+        super().__init__(*args)
+        self.vocab = vocab
+        self.pattern = pattern
+        self.add_prefix_space = add_prefix_space
+        # Normalise `None` to an empty list: the membership test in `extract_vocab_merges_from_model`
+        # raises `TypeError` on `None`.
+        self.additional_special_tokens = additional_special_tokens if additional_special_tokens is not None else []
+
+    def extract_vocab_merges_from_model(self, vocab: dict):
+        """
+        Build the BPE vocabulary and the ordered merge list from a token -> rank mapping.
+
+        Returns:
+            `tuple`: `(vocab, merges)` where `vocab` maps each token string to its enumeration index and `merges`
+            is a list of `(left, right)` string pairs sorted by the rank of the merged token.
+        """
+        bpe_ranks = vocab
+        byte_encoder = bytes_to_unicode()
+        # Hash lookup instead of scanning the special-token list once per vocabulary entry.
+        special_tokens = set(self.additional_special_tokens)
+
+        def token_bytes_to_string(b):
+            return "".join([byte_encoder[ord(char)] for char in b.decode("latin-1")])
+
+        merges = []
+        vocab = {}
+        for idx, (token, rank) in enumerate(bpe_ranks.items()):
+            if token not in special_tokens:
+                vocab[token_bytes_to_string(token)] = idx
+                if len(token) == 1:
+                    continue
+                # Every split of the token whose two halves are themselves tokens is a candidate merge.
+                local = []
+                for index in range(1, len(token)):
+                    piece_l, piece_r = token[:index], token[index:]
+                    if piece_l in bpe_ranks and piece_r in bpe_ranks and (piece_l + piece_r) in bpe_ranks:
+                        local.append((piece_l, piece_r, rank))
+                local = sorted(local, key=lambda x: (bpe_ranks[x[0]], bpe_ranks[x[1]]), reverse=False)
+                merges.extend(local)
+            else:
+                # Special tokens are kept verbatim and never take part in merges.
+                vocab[token] = idx
+        merges = sorted(merges, key=lambda val: val[2], reverse=False)
+        merges = [(token_bytes_to_string(val[0]), token_bytes_to_string(val[1])) for val in merges]
+        return vocab, merges
+
+    def tokenizer(self):
+        """Create the bare BPE `Tokenizer` from `self.vocab`, enabling `ignore_merges` when the model supports it."""
+        vocab_scores, merges = self.extract_vocab_merges_from_model(self.vocab)
+        tokenizer = Tokenizer(BPE(vocab_scores, merges, fuse_unk=False))
+        if hasattr(tokenizer.model, "ignore_merges"):
+            tokenizer.model.ignore_merges = True
+        return tokenizer
+
+    def converted(self) -> Tokenizer:
+        """Return the BPE tokenizer with pre-tokenizer, decoder, special tokens and post-processor attached."""
+        tokenizer = self.tokenizer()
+        tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
+            [
+                pre_tokenizers.Split(Regex(self.pattern), behavior="isolated", invert=False),
+                pre_tokenizers.ByteLevel(add_prefix_space=self.add_prefix_space, use_regex=False),
+            ]
+        )
+        tokenizer.decoder = decoders.ByteLevel()
+        tokenizer.add_special_tokens(self.additional_special_tokens)
+
+        tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
+
+        return tokenizer
+
+
+def convert_mistral_tokenizer():
+    """Convert the `mistral_common` tokenizer of Pixtral-12B into a `PreTrainedTokenizerFast`."""
+    model_name = "mistralai/Pixtral-12B-2409"
+
+    tokenizer = MistralTokenizer.from_model(model_name)
+
+    vocab = tokenizer.instruct_tokenizer.tokenizer._tekken_token2id_nospecial
+    all_special = [
+        token.value if hasattr(token, "value") else token
+        for token in tokenizer.instruct_tokenizer.tokenizer._all_special_tokens
+    ]
+    # Special tokens come first in the merged mapping, followed by the regular vocabulary.
+    specials_tokens = {token: all_special.index(token) for token in all_special}
+    specials_tokens.update(vocab)
+    vocab = specials_tokens
+
+    # NOTE(review): these three literals were empty strings, which look like markup-stripped values;
+    # restored to the usual `<s>` / `<unk>` / `</s>` - confirm they are present in `all_special`.
+    tokenizer = PreTrainedTokenizerFast(
+        tokenizer_object=MistralConverter(vocab=vocab, additional_special_tokens=all_special).converted(),
+        bos_token="<s>",
+        unk_token="<unk>",
+        eos_token="</s>",
+    )
+    tokenizer.model_input_names = ["input_ids", "attention_mask"]
+
+    return tokenizer
+
+
+def permute_for_rope(value, n_heads, config):
+    """
+    Reorder the rows of a projection weight for the rotary embedding layout.
+
+    The rows of each head are viewed as `(dim // 2, 2)`, the two axes are swapped, and the result is reshaped
+    back to `(value.shape[0], config.hidden_size)`.
+    """
+    dim1 = value.shape[0]
+    dim2 = config.hidden_size
+    return value.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2)
+
+
+def convert_dictionnary(original_state_dict, vision_config, text_config):
+    """
+    Rename the keys of the original state dict via `OLD_KEY_TO_NEW_KEY_MAPPING` and permute the q/k projections.
+
+    Returns:
+        `dict`: New key -> tensor mapping.
+    """
+    new_dict = {}
+
+    # All keys are joined into one newline-separated string so that each pattern is applied in a single
+    # `re.sub` pass; the leading "\n" anchors every pattern at the start of a key.
+    all_keys = "\n" + "\n".join(original_state_dict.keys())
+    old_keys = all_keys
+    for old, new in OLD_KEY_TO_NEW_KEY_MAPPING.items():
+        all_keys = re.sub(r"\n" + old, r"\n" + new, all_keys)
+
+    OLD_TO_NEW = dict(zip(old_keys.split("\n"), all_keys.split("\n")))
+
+    for key, value in original_state_dict.items():
+        new_key = OLD_TO_NEW[key]
+        if "vision_encoder" in key:
+            _config = vision_config
+            num_attention_heads = _config.num_attention_heads
+        else:
+            _config = text_config
+            if "q_proj" in new_key:
+                num_attention_heads = _config.num_attention_heads
+            if "k_proj" in new_key:
+                num_attention_heads = _config.num_key_value_heads
+            # convert the text model (basically mistral model)
+
+        if "q_proj" in new_key or "k_proj" in new_key:
+            value = permute_for_rope(value, num_attention_heads, _config)
+
+        new_dict[new_key] = value
+    return new_dict
+
+
+def convert_mistral_model(input_dir, output_dir):
+    """
+    Convert the checkpoint at `{input_dir}/consolidated.safetensors` and write config, weights and processor to
+    `output_dir`.
+    """
+    text_config = MistralConfig(
+        attention_dropout=0.0,
+        bos_token_id=1,
+        eos_token_id=2,
+        head_dim=128,
+        hidden_act="silu",
+        hidden_size=5120,
+        initializer_range=0.02,
+        intermediate_size=14336,
+        max_position_embeddings=1024000,
+        model_type="mistral",
+        num_attention_heads=32,
+        num_hidden_layers=40,
+        num_key_value_heads=8,
+        rms_norm_eps=1e-05,
+        rope_theta=1000000000.0,
+        sliding_window=None,
+        tie_word_embeddings=False,
+        vocab_size=131072,
+    )
+
+    vision_config = PixtralVisionConfig()
+    config = LlavaConfig(
+        vision_config,
+        text_config,
+        vision_feature_layer=-1,
+        image_token_index=10,
+        vision_feature_select_strategy="full",
+        image_seq_length=1,
+    )
+    config.architectures = ["LlavaForConditionalGeneration"]
+    config.save_pretrained(output_dir)
+
+    original_state_dict = safe_load_file(f"{input_dir}/consolidated.safetensors")
+    new_dict = convert_dictionnary(original_state_dict, vision_config, text_config)
+
+    # Build the model on the meta device, then attach the converted tensors with `assign=True`.
+    with torch.device("meta"):
+        model = LlavaForConditionalGeneration(config)
+    model.load_state_dict(new_dict, strict=True, assign=True)
+
+    model.save_pretrained(output_dir)
+
+    tokenizer = convert_mistral_tokenizer()
+    image_processor = PixtralImageProcessor()
+    processor = PixtralProcessor(tokenizer=tokenizer, image_processor=image_processor, image_token="[IMG]")
+    processor.save_pretrained(output_dir)
+
+
+def main():
+    """Parse the command line and run the conversion."""
+    parser = argparse.ArgumentParser()
+    # Both paths are mandatory: without them the conversion would try to read "None/consolidated.safetensors".
+    parser.add_argument(
+        "--input_dir",
+        required=True,
+        help="Location of the original Pixtral weights; must contain `consolidated.safetensors`",
+    )
+    parser.add_argument(
+        "--output_dir",
+        required=True,
+        help="Location to write HF model and tokenizer",
+    )
+
+    args = parser.parse_args()
+    convert_mistral_model(args.input_dir, args.output_dir)
+
+
+if __name__ == "__main__":
+    main()
diff --git
a/src/transformers/models/pixtral/image_processing_pixtral.py b/src/transformers/models/pixtral/image_processing_pixtral.py new file mode 100644 index 00000000000000..c6d18420bec575 --- /dev/null +++ b/src/transformers/models/pixtral/image_processing_pixtral.py @@ -0,0 +1,519 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for Pixtral.""" + +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import ( + resize, + to_channel_dimension_format, +) +from ...image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + is_valid_image, + to_numpy_array, + valid_images, + validate_kwargs, + validate_preprocess_arguments, +) +from ...utils import TensorType, is_torch_device, is_torch_dtype, is_torch_tensor, is_vision_available, logging +from ...utils.import_utils import requires_backends + + +logger = logging.get_logger(__name__) + + +if is_vision_available(): + import PIL + + +class BatchMixFeature(BatchFeature): + def to(self, *args, **kwargs) -> "BatchMixFeature": + """ + Send all values to device by calling `v.to(*args, **kwargs)` (PyTorch only). 
This should support casting in
+        different `dtypes` and sending the `BatchFeature` to a different `device`.
+
+        Args:
+            args (`Tuple`):
+                Will be passed to the `to(...)` function of the tensors.
+            kwargs (`Dict`, *optional*):
+                Will be passed to the `to(...)` function of the tensors.
+
+        Returns:
+            [`BatchMixFeature`]: The same instance after modification.
+        """
+        requires_backends(self, ["torch"])
+        import torch  # noqa
+
+        new_data = {}
+        device = kwargs.get("device")
+        # Check if the args are a device or a dtype
+        if device is None and len(args) > 0:
+            # device should be always the first argument
+            arg = args[0]
+            if is_torch_dtype(arg):
+                # The first argument is a dtype
+                pass
+            elif isinstance(arg, str) or is_torch_device(arg) or isinstance(arg, int):
+                device = arg
+            else:
+                # it's something else
+                raise ValueError(f"Attempting to cast a BatchFeature to type {str(arg)}. This is not supported.")
+        # We cast only floating point tensors to avoid issues with tokenizers casting `LongTensor` to `FloatTensor`
+        for k, v in self.items():
+            # check if v is a floating point
+            if isinstance(v, list):
+                # A list value is iterated as a list of per-sample lists: the result is flattened by one level,
+                # non-tensor elements are dropped, and every kept tensor receives the full `to(*args, **kwargs)`
+                # call (no floating-point check in this branch).
+                new_data[k] = [
+                    element.to(*args, **kwargs) for sample in v for element in sample if is_torch_tensor(element)
+                ]
+            elif torch.is_floating_point(v):
+                # cast and send to device
+                new_data[k] = v.to(*args, **kwargs)
+            elif device is not None:
+                new_data[k] = v.to(device=device)
+            else:
+                new_data[k] = v
+        self.data = new_data
+        return self
+
+
+# Copied from transformers.models.idefics2.image_processing_idefics2.make_list_of_images
+def make_list_of_images(images: ImageInput) -> List[List[np.ndarray]]:
+    """
+    Convert a single image or a list of images to a list of numpy arrays.
+
+    Args:
+        images (`ImageInput`):
+            A single image or a list of images.
+
+    Returns:
+        A list of numpy arrays.
+    """
+    # If it's a single image, convert it to a list of lists
+    if is_valid_image(images):
+        images = [[images]]
+    # If it's a list of images, it's a single batch, so convert it to a list of lists
+    elif isinstance(images, (list, tuple)) and len(images) > 0 and is_valid_image(images[0]):
+        images = [images]
+    # If it's a list of batches, it's already in the right format
+    elif (
+        isinstance(images, (list, tuple))
+        and len(images) > 0
+        and isinstance(images[0], (list, tuple))
+        and is_valid_image(images[0][0])
+    ):
+        pass
+    else:
+        raise ValueError(
+            "Invalid input type. Must be a single image, a list of images, or a list of batches of images."
+        )
+    return images
+
+
+# Adapted from function in image_transforms.py to ensure any transparent pixels are converted to white.
+def convert_to_rgb(image: ImageInput) -> ImageInput:
+    """
+    Converts an image to RGB format. Only converts if the image is of type PIL.Image.Image, otherwise returns the image
+    as is.
+    Args:
+        image (Image):
+            The image to convert.
+    """
+    requires_backends(convert_to_rgb, ["vision"])
+
+    if not isinstance(image, PIL.Image.Image):
+        return image
+
+    if image.mode == "RGB":
+        return image
+
+    # First we convert to RGBA to set background to white.
+    image = image.convert("RGBA")
+
+    # Create a new image with a white background.
+    new_image = PIL.Image.new("RGBA", image.size, "WHITE")
+    new_image.paste(image, (0, 0), image)
+    new_image = new_image.convert("RGB")
+    return new_image
+
+
+def _num_image_tokens(image_size: Tuple[int, int], patch_size: Union[int, Tuple[int, int]]) -> Tuple[int, int]:
+    """
+    Calculate the number of image tokens along each dimension given the image size and patch size.
+
+    Args:
+        image_size (`Tuple[int, int]`):
+            The size of the image as `(height, width)`.
+        patch_size (`int` or `Tuple[int, int]`):
+            The patch size as `(height, width)`. An integer is used for both dimensions.
+
+    Returns:
+        `Tuple[int, int]`: The number of tokens as `(num_height_tokens, num_width_tokens)`.
+    """
+    height, width = image_size
+    patch_height, patch_width = patch_size if isinstance(patch_size, (tuple, list)) else (patch_size, patch_size)
+    # Ceiling division: a partially covered patch still counts as one token.
+    num_width_tokens = (width - 1) // patch_width + 1
+    num_height_tokens = (height - 1) // patch_height + 1
+    return num_height_tokens, num_width_tokens
+
+
+def get_resize_output_image_size(
+    input_image: np.ndarray,
+    size: Union[int, Tuple[int, int], List[int], Tuple[int]],
+    patch_size: Union[int, Tuple[int, int], List[int], Tuple[int]],
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> tuple:
+    """
+    Find the target (height, width) dimension of the output image after resizing given the input image and the desired
+    size.
+
+    Args:
+        input_image (`np.ndarray`):
+            The image to resize.
+        size (`int` or `Tuple[int, int]`):
+            Maximum `(height, width)` an input image can have. If size is an integer, `(size, size)` will be used.
+        patch_size (`int` or `Tuple[int, int]`):
+            The patch_size as `(height, width)` to use for resizing the image. If patch_size is an integer, `(patch_size, patch_size)`
+            will be used
+        input_data_format (`ChannelDimension`, *optional*):
+            The channel dimension format of the input image. If unset, will use the inferred format from the input.
+
+    Returns:
+        `tuple`: The target (height, width) dimension of the output image after resizing.
+    """
+    max_height, max_width = size if isinstance(size, (tuple, list)) else (size, size)
+    patch_height, patch_width = patch_size if isinstance(patch_size, (tuple, list)) else (patch_size, patch_size)
+    height, width = get_image_size(input_image, input_data_format)
+
+    # Images are only ever scaled down: a ratio above 1 means at least one side exceeds its maximum.
+    ratio = max(height / max_height, width / max_width)
+
+    if ratio > 1:
+        # Original implementation uses `round` which utilises bankers rounding, which can lead to surprising results
+        height = int(np.ceil(height / ratio))
+        width = int(np.ceil(width / ratio))
+
+    # Round each side up to a whole number of patches.
+    num_height_tokens, num_width_tokens = _num_image_tokens((height, width), (patch_height, patch_width))
+    return num_height_tokens * patch_height, num_width_tokens * patch_width
+
+
+# Hack to get tensor conversion used in BatchFeature without batching the images
+def _get_is_as_tensor_fns(tensor_type: Union[str, TensorType]) -> Tuple[Callable, Callable]:
+    return BatchFeature()._get_is_as_tensor_fns(tensor_type)
+
+
+def convert_to_tensor(array, tensor_type: Union[str, TensorType]) -> Any:
+    """Return `array` unchanged if it already is a tensor of `tensor_type`, otherwise convert it."""
+    is_tensor, as_tensor = _get_is_as_tensor_fns(tensor_type)
+    if is_tensor(array):
+        return array
+    return as_tensor(array)
+
+
+class PixtralImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a Pixtral image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
+            `do_resize` in the `preprocess` method.
+        size (`Dict[str, int]` *optional*, defaults to `{"longest_edge": 1024}`):
+            Size of the maximum dimension of either the height or width dimension of the image. Used to control how
+            images are resized.
If either the height or width are greater than `size["longest_edge"]` then both the height and width are rescaled by `height / ratio`, `width /ratio` where `ratio = max(height / longest_edge, width / longest_edge)` + patch_size (`Dict[str, int]` *optional*, defaults to `{"height": 16, "width": 16}`): + Size of the patches in the model, used to calculate the output image size. Can be overridden by `patch_size` in the `preprocess` method. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in + the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` + method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. 
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[Dict[str, int]] = None,
+        patch_size: Optional[Dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: bool = True,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        # `None` defaults are resolved here so that no mutable object is shared through the signature.
+        size = size if size is not None else {"longest_edge": 1024}
+        patch_size = patch_size if patch_size is not None else {"height": 16, "width": 16}
+        patch_size = get_size_dict(patch_size, default_to_square=True)
+
+        self.do_resize = do_resize
+        self.size = size
+        self.patch_size = patch_size
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else [0.48145466, 0.4578275, 0.40821073]
+        self.image_std = image_std if image_std is not None else [0.26862954, 0.26130258, 0.27577711]
+        self.do_convert_rgb = do_convert_rgb
+        self._valid_processor_keys = [
+            "images",
+            "do_resize",
+            "size",
+            "patch_size",
+            "resample",
+            "do_rescale",
+            "rescale_factor",
+            "do_normalize",
+            "image_mean",
+            "image_std",
+            "do_convert_rgb",
+            "return_tensors",
+            "data_format",
+            "input_data_format",
+        ]
+
+    def resize(
+        self,
+        image: np.ndarray,
+        size: Dict[str, int],
+        patch_size: Dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image. The target size is computed by `get_resize_output_image_size` from the maximum size given
+        by `size` and from `patch_size`.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`Dict[str, int]`):
+                Dict containing the longest possible edge of the image.
+            patch_size (`Dict[str, int]`):
+                Patch size used to calculate the size of the output image.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+                Resampling filter to use when resizing the image.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+
+        Raises:
+            ValueError: If `size` has neither `"longest_edge"` nor both `"height"` and `"width"`, or if `patch_size`
+                lacks `"height"` or `"width"`.
+        """
+        if "longest_edge" in size:
+            size = (size["longest_edge"], size["longest_edge"])
+        elif "height" in size and "width" in size:
+            size = (size["height"], size["width"])
+        else:
+            raise ValueError("size must contain either 'longest_edge' or 'height' and 'width'.")
+
+        if "height" in patch_size and "width" in patch_size:
+            patch_size = (patch_size["height"], patch_size["width"])
+        else:
+            # Only the height/width form is accepted for patch_size; the message must not advertise other keys.
+            raise ValueError("patch_size must contain 'height' and 'width'.")
+
+        output_size = get_resize_output_image_size(
+            image,
+            size=size,
+            patch_size=patch_size,
+            input_data_format=input_data_format,
+        )
+        return resize(
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: bool = None,
+        size: Dict[str, int] = None,
+        patch_size: Dict[str, int] = None,
+        resample: PILImageResampling = None,
+        do_rescale: bool = None,
+        rescale_factor: float = None,
+        do_normalize: bool = None,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: bool = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str,
ChannelDimension]] = None, + **kwargs, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Describes the maximum input dimensions to the model. + patch_size (`Dict[str, int]`, *optional*, defaults to `self.patch_size`): + Patch size in the model. Used to calculate the image after resizing. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only + has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to + `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. 
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. 
+ """ + patch_size = patch_size if patch_size is not None else self.patch_size + patch_size = get_size_dict(patch_size, default_to_square=True) + + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys) + + images_list = make_list_of_images(images) + + if not valid_images(images_list[0]): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + + if do_convert_rgb: + images_list = [[convert_to_rgb(image) for image in images] for images in images_list] + + # All transformations expect numpy arrays. + images_list = [[to_numpy_array(image) for image in images] for images in images_list] + + if is_scaled_image(images_list[0][0]) and do_rescale: + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
+ input_data_format = infer_channel_dimension_format(images_list[0][0]) + + batch_images = [] + batch_image_sizes = [] + for sample_images in images_list: + images = [] + image_sizes = [] + for image in sample_images: + if do_resize: + image = self.resize( + image=image, + size=size, + patch_size=patch_size, + resample=resample, + input_data_format=input_data_format, + ) + + if do_rescale: + image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + + if do_normalize: + image = self.normalize( + image=image, mean=image_mean, std=image_std, input_data_format=input_data_format + ) + + images.append(image) + image_sizes.append(get_image_size(image, input_data_format)) + batch_images.append(images) + batch_image_sizes.append(image_sizes) + + images_list = [ + [to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images] + for images in batch_images + ] + + # Convert to tensor type outside of BatchFeature to avoid batching the images of different sizes + images_list = [[convert_to_tensor(image, return_tensors) for image in images] for images in images_list] + return BatchMixFeature(data={"pixel_values": images_list, "image_sizes": batch_image_sizes}, tensor_type=None) diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py new file mode 100644 index 00000000000000..0e10c78b7852af --- /dev/null +++ b/src/transformers/models/pixtral/modeling_pixtral.py @@ -0,0 +1,517 @@ +# coding=utf-8 +# Copyright 2024 the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Pixtral model.""" + +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from ... import PreTrainedModel +from ...activations import ACT2FN +from ...modeling_outputs import BaseModelOutput +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, +) +from .configuration_pixtral import PixtralVisionConfig + + +logger = logging.get_logger(__name__) + + +def position_ids_in_meshgrid(patch_embeds_list, max_width): + positions = [] + for patch in patch_embeds_list: + height, width = patch.shape[-2:] + mesh = torch.meshgrid(torch.arange(height), torch.arange(width), indexing="ij") + h_grid, v_grid = torch.stack(mesh, dim=-1).reshape(-1, 2).chunk(2, -1) + ids = h_grid * max_width + v_grid + positions.append(ids[:, 0]) + return torch.cat(positions) + + +class PixtralRotaryEmbedding(nn.Module): + """ + The key with pixtral embedding is just that you have a frequency for each pixel position. + If you have height x width pixels (or embedding pixels) + + then the frequency used for ROPE is given by indexing the pre_computed frequency on the + width and height. + + What you output is of dimension batch, height * width, dim with dim the embed dim. + + This simply means that for each image hidden states, you are going to add + a corresponding positional embedding, based on its index in the grid.
+ """ + + def __init__(self, config, device): + super().__init__() + self.rope_type = "default" + self.dim = config.head_dim + self.base = config.rope_theta + max_patches_per_side = config.image_size // config.patch_size + freqs = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float() / self.dim)) + + h = torch.arange(max_patches_per_side, device=freqs.device) + w = torch.arange(max_patches_per_side, device=freqs.device) + + freqs_h = torch.outer(h, freqs[::2]).float() + freqs_w = torch.outer(w, freqs[1::2]).float() + inv_freq = torch.cat( + [ + freqs_h[:, None, :].repeat(1, max_patches_per_side, 1), + freqs_w[None, :, :].repeat(max_patches_per_side, 1, 1), + ], + dim=-1, + ).reshape(-1, self.dim // 2) # we reshape to only index on the position indexes, not tuple of indexes + # Different from paper, but it uses a different permutation in order to obtain the same calculation + + # TODO maybe make it torch compatible later on. We can also just slice + self.register_buffer("inv_freq", torch.cat((inv_freq, inv_freq), dim=-1), persistent=False) + + @torch.no_grad() + def forward(self, x, position_ids): + if "dynamic" in self.rope_type: + self._dynamic_frequency_update(position_ids, device=x.device) + + # Core RoPE block + freqs = self.inv_freq[position_ids] + # position_ids_expanded = position_ids[:, None, :].float() + # Force float32 (see https://github.com/huggingface/transformers/pull/29285) + device_type = x.device.type + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + emb = freqs + cos = emb.cos() + sin = emb.sin() + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + def _dynamic_frequency_update(self, position_ids, device): + """ + dynamic RoPE layers should recompute `inv_freq` in the following situations: + 1 - growing beyond the cached sequence length (allow scaling) + 2 - the current sequence length is in the original scale (avoid losing 
precision with small sequences) + """ + seq_len = torch.max(position_ids) + 1 + if seq_len > self.max_seq_len_cached: # growth + inv_freq, self.attention_scaling = self.rope_init_fn( + self.config, device, seq_len=seq_len, **self.rope_kwargs + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation + self.max_seq_len_cached = seq_len + + if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset + self.register_buffer("inv_freq", self.original_inv_freq, persistent=False) + self.max_seq_len_cached = self.original_max_seq_len + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. 
Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class PixtralAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False) + self.o_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + """Input shape: Batch x Time x Channel""" + + batch_size, patches, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, 
key_states, cos, sin, unsqueeze_dim=0) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale + + if attention_mask is not None: + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(batch_size, patches, -1) + + attn_output = self.o_proj(attn_output) + + return attn_output, attn_weights + + +# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Pixtral +class PixtralMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, hidden_state): + return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state)) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Pixtral +class PixtralRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + PixtralRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return 
self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +class PixtralAttentionLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.attention_norm = PixtralRMSNorm(config.hidden_size, eps=1e-5) + self.feed_forward = PixtralMLP(config) + self.attention = PixtralAttention(config) + self.ffn_norm = PixtralRMSNorm(config.hidden_size, eps=1e-5) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + position_embeddings: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): + Input to the layer of shape `(batch, seq_len, embed_dim)`. + attention_mask (`torch.FloatTensor`): + Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
+ """ + residual = hidden_states + + hidden_states = self.attention_norm(hidden_states) + hidden_states, attn_weights = self.attention( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.ffn_norm(hidden_states) + hidden_states = self.feed_forward(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + return outputs + + +class PixtralTransformer(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layers = torch.nn.ModuleList() + for _ in range(config.num_hidden_layers): + self.layers.append(PixtralAttentionLayer(config)) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for encoder_layer in self.layers: + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + attention_mask, + position_embeddings, + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=[hidden_states], attentions=all_attentions + ) + + +PIXTRAL_START_DOCSTRING = r""" + This model inherits from 
[`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`PixtralVisionConfig`] or [`PixtralVisionConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", + PIXTRAL_START_DOCSTRING, +) +class PixtralPreTrainedModel(PreTrainedModel): + config_class = PixtralVisionConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["PixtralVisionAttention"] + _skip_keys_device_placement = "past_key_values" + _supports_cache_class = True + + def _init_weights(self, module): + # important: this ported version of Pixtral isn't meant for training from scratch - only + # inference and fine-tuning - so the proper init weights code has been removed - the original codebase + # https://github.com/haotian-liu/LLaVA/tree/main/pixtral should serve for that purpose + std = ( + self.config.initializer_range + if hasattr(self.config, "initializer_range") + else self.config.text_config.initializer_range + ) + + if isinstance(module, (nn.Linear, nn.Conv2d)): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + 
module.weight.data[module.padding_idx].zero_() + + +PIXTRAL_INPUTS_DOCSTRING = r""" + Args: + pixel_values: list of N_img images of variable sizes, + each of shape (C, H, W) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +def generate_block_attention_mask(patch_embeds_list, tensor): + dtype = tensor.dtype + device = tensor.device + seq_len = tensor.shape[1] + d_min = torch.finfo(dtype).min + causal_mask = torch.full((seq_len, seq_len), fill_value=d_min, dtype=dtype, device=device) + + block_end_idx = torch.tensor(patch_embeds_list).cumsum(-1) + block_start_idx = torch.tensor([0] + patch_embeds_list[:-1]).cumsum(-1) + for start, end in zip(block_start_idx, block_end_idx): + causal_mask[start:end, start:end] = 0 + + causal_mask = causal_mask[None, None, :, :].expand(tensor.shape[0], 1, -1, -1) + return causal_mask + + +@add_start_docstrings( + """The PIXTRAL model which consists of a vision backbone and a language model.""", + PIXTRAL_START_DOCSTRING, +) +class PixtralModel(PixtralPreTrainedModel): + base_model_prefix = "vision_encoder" + + def __init__(self, config): + super().__init__(config) + self.config = config + self.patch_conv = nn.Conv2d( + in_channels=config.num_channels, + out_channels=config.hidden_size, + kernel_size=config.patch_size, + stride=config.patch_size, + bias=False, + ) + self.ln_pre = PixtralRMSNorm(config.hidden_size, eps=1e-5) + self.transformer = PixtralTransformer(config) + self.patch_positional_embedding = PixtralRotaryEmbedding(config, device=self.device) + + @add_start_docstrings_to_model_forward(PIXTRAL_INPUTS_DOCSTRING) + 
def forward( + self, + pixel_values: List[torch.Tensor], + output_hidden_states: Optional[bool] = False, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + *args, + **kwargs, + ) -> Union[Tuple, BaseModelOutput]: + """ + Returns: + pixel_values: tensor of token features for + all tokens of all images of shape (N_toks, D) + """ + # pass images through initial convolution independently + patch_embeds_list = [self.patch_conv(img.unsqueeze(0).to(self.dtype)) for img in pixel_values] + + # flatten to a single sequence + patch_embeds = torch.cat([p.flatten(2).permute(0, 2, 1) for p in patch_embeds_list], dim=1) + patch_embeds = self.ln_pre(patch_embeds) + + # positional embeddings + position_ids = position_ids_in_meshgrid( + patch_embeds_list, max_width=self.config.image_size // self.config.patch_size + ).to(self.device) + + position_embedding = self.patch_positional_embedding(patch_embeds, position_ids) + attention_mask = generate_block_attention_mask( + [p.shape[-2] * p.shape[-1] for p in patch_embeds_list], patch_embeds + ) + return self.transformer(patch_embeds, attention_mask, position_embedding) diff --git a/src/transformers/models/pixtral/processing_pixtral.py b/src/transformers/models/pixtral/processing_pixtral.py new file mode 100644 index 00000000000000..9362703c8aa6da --- /dev/null +++ b/src/transformers/models/pixtral/processing_pixtral.py @@ -0,0 +1,282 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for Pixtral. +""" + +from typing import List, Optional, Union + +from ...feature_extraction_utils import BatchFeature +from ...image_utils import ImageInput, is_valid_image, load_image +from ...processing_utils import ProcessorMixin +from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy +from ...utils import TensorType, is_torch_device, is_torch_dtype, is_torch_tensor, logging, requires_backends + + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.idefics2.processing_idefics2.is_url +def is_url(val) -> bool: + return isinstance(val, str) and val.startswith("http") + + +# Copied from transformers.models.idefics2.processing_idefics2.is_image_or_image_url +def is_image_or_image_url(elem): + return is_url(elem) or is_valid_image(elem) + + +# Copied from transformers.models.pixtral.image_processing_pixtral.BatchMixFeature +class BatchMixFeature(BatchFeature): + def to(self, *args, **kwargs) -> "BatchMixFeature": + """ + Send all values to device by calling `v.to(*args, **kwargs)` (PyTorch only). This should support casting in + different `dtypes` and sending the `BatchFeature` to a different `device`. + + Args: + args (`Tuple`): + Will be passed to the `to(...)` function of the tensors. + kwargs (`Dict`, *optional*): + Will be passed to the `to(...)` function of the tensors. + + Returns: + [`BatchFeature`]: The same instance after modification. 
+ """ + requires_backends(self, ["torch"]) + import torch # noqa + + new_data = {} + device = kwargs.get("device") + # Check if the args are a device or a dtype + if device is None and len(args) > 0: + # device should be always the first argument + arg = args[0] + if is_torch_dtype(arg): + # The first argument is a dtype + pass + elif isinstance(arg, str) or is_torch_device(arg) or isinstance(arg, int): + device = arg + else: + # it's something else + raise ValueError(f"Attempting to cast a BatchFeature to type {str(arg)}. This is not supported.") + # We cast only floating point tensors to avoid issues with tokenizers casting `LongTensor` to `FloatTensor` + for k, v in self.items(): + # check if v is a floating point + if isinstance(v, list): + new_data[k] = [ + element.to(*args, **kwargs) for sample in v for element in sample if is_torch_tensor(element) + ] + elif torch.is_floating_point(v): + # cast and send to device + new_data[k] = v.to(*args, **kwargs) + elif device is not None: + new_data[k] = v.to(device=device) + else: + new_data[k] = v + self.data = new_data + return self + + +class PixtralProcessor(ProcessorMixin): + r""" + Constructs a Pixtral processor which wraps a Pixtral image processor and a Pixtral tokenizer into a single processor. + + [`PixtralProcessor`] offers all the functionalities of [`CLIPImageProcessor`] and [`LlamaTokenizerFast`]. See the + [`~PixtralProcessor.__call__`] and [`~PixtralProcessor.decode`] for more information. + + Args: + image_processor ([`PixtralImageProcessor`], *optional*): + The image processor is a required input. + tokenizer ([`LlamaTokenizerFast`], *optional*): + The tokenizer is a required input. + patch_size (`int`, *optional*, defaults to 16): + Patch size from the vision tower. + chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. 
+ image_token (`str`, *optional*, defaults to `"[IMG]"`): + Special token used to denote image location. + image_break_token (`str`, *optional*, defaults to `"[IMG_BREAK]"`): + Special token used to denote the end of a line of pixels in an image. + image_end_token (`str`, *optional*, defaults to `"[IMG_END]"`): + Special token used to denote the end of an image input. + """ + + attributes = ["image_processor", "tokenizer"] + valid_kwargs = [ + "chat_template", + "patch_size", + "image_token", + "image_break_token", + "image_end_token", + ] + image_processor_class = "AutoImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__( + self, + image_processor=None, + tokenizer=None, + patch_size: int = 16, + chat_template=None, + image_token="[IMG]", # set the default and let users change if they have peculiar special tokens in rare cases + image_break_token="[IMG_BREAK]", + image_end_token="[IMG_END]", + **kwargs, + ): + self.patch_size = patch_size + self.image_token = image_token + self.image_break_token = image_break_token + self.image_end_token = image_end_token + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + images: ImageInput = None, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = None, + max_length=None, + return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH, + ) -> BatchMixFeature: + """ + Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text` + and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to + CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`.
Please refer to the doctsring + of the above two methods for more information. + + Args: + text (`str`, `List[str]`, `List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding + index) among: + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + max_length (`int`, *optional*): + Maximum length of the returned list and optionally padding length (see above). + truncation (`bool`, *optional*): + Activates truncation to cut input sequences longer than `max_length` to `max_length`. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. 
+ + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + if images is not None: + if is_image_or_image_url(images): + images = [[images]] + elif isinstance(images, list) and is_image_or_image_url(images[0]): + images = [images] + elif ( + not isinstance(images, list) + and not isinstance(images[0], list) + and not is_image_or_image_url(images[0][0]) + ): + raise ValueError( + "Invalid input images. Please provide a single image or a list of images or a list of list of images." + ) + images = [[load_image(im) for im in sample] for sample in images] + image_inputs = self.image_processor(images, patch_size=self.patch_size, return_tensors=return_tensors) + else: + image_inputs = {} + + if isinstance(text, str): + text = [text] + elif not isinstance(text, list) and not isinstance(text[0], str): + raise ValueError("Invalid input text. 
Please provide a string, or a list of strings") + + # try to expand inputs in processing if we have the necessary parts + prompt_strings = text + if image_inputs.get("pixel_values") is not None: + # Replace the image token with the expanded image token sequence + images = image_inputs["pixel_values"] + image_sizes = image_inputs.pop("image_sizes") + prompt_strings = [] + + for sample_images, sample_image_sizes, sample in zip(images, image_sizes, text): + replace_strings = [] + # First calculate the number of tokens needed for each image and put in a placeholder + for image, image_size in zip(sample_images, sample_image_sizes): + height, width = image_size + num_height_tokens = height // self.patch_size + num_width_tokens = width // self.patch_size + replace_tokens = [ + [self.image_token] * num_width_tokens + [self.image_break_token] + ] * num_height_tokens + # Flatten list + replace_tokens = [item for sublist in replace_tokens for item in sublist] + replace_tokens[-1] = self.image_end_token + replace_str = "".join(replace_tokens) + replace_strings.append(replace_str) + sample = sample.replace(self.image_token, "", 1) + + while "" in sample: + replace_str = replace_strings.pop(0) + sample = sample.replace("", replace_str, 1) + + prompt_strings.append(sample) + + text_inputs = self.tokenizer( + prompt_strings, + return_tensors=return_tensors, + padding=padding, + truncation=truncation, + max_length=max_length, + ) + return BatchMixFeature(data={**text_inputs, **image_inputs}) + + # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. 
+ """ + return self.tokenizer.batch_decode(*args, **kwargs) + + # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index b9ce0d0f15bbf5..2db7b38b580375 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -7067,6 +7067,20 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class PixtralModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PixtralPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class PLBartForCausalLM(metaclass=DummyObject): _backends = ["torch"] diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 2493954a518b2c..436378582e54ca 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -506,6 +506,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class PixtralImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + 
@slow
@require_bitsandbytes
def test_pixtral(self):
    """Generate a description of four images with the Pixtral checkpoint and compare it to a reference text."""
    model_id = "hf-internal-testing/pixtral-12b"
    model = LlavaForConditionalGeneration.from_pretrained(model_id)
    processor = AutoProcessor.from_pretrained(model_id)

    # Despite the name, these are the downloaded PIL images, one per `[IMG]` token in the prompt.
    IMG_URLS = [
        Image.open(requests.get("https://picsum.photos/id/237/400/300", stream=True).raw),
        Image.open(requests.get("https://picsum.photos/id/231/200/300", stream=True).raw),
        Image.open(requests.get("https://picsum.photos/id/27/500/500", stream=True).raw),
        Image.open(requests.get("https://picsum.photos/id/17/150/600", stream=True).raw),
    ]
    PROMPT = "[INST]Describe the images.\n[IMG][IMG][IMG][IMG][/INST]"

    # NOTE(review): inputs are moved to "cuda" while the model is loaded without an explicit device - confirm.
    inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to("cuda")
    generate_ids = model.generate(**inputs, max_new_tokens=500)
    output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

    # NOTE(review): the indentation of the bullet lines below was reconstructed - confirm against the model output.
    # fmt: off
    EXPECTED_GENERATION = """
Describe the images.
Sure, let's break down each image description:

1. **Image 1:**
   - **Description:** A black dog with a glossy coat is sitting on a wooden floor. The dog has a focused expression and is looking directly at the camera.
   - **Details:** The wooden floor has a rustic appearance with visible wood grain patterns. The dog's eyes are a striking color, possibly brown or amber, which contrasts with its black fur.

2. **Image 2:**
   - **Description:** A scenic view of a mountainous landscape with a winding road cutting through it. The road is surrounded by lush green vegetation and leads to a distant valley.
   - **Details:** The mountains are rugged with steep slopes, and the sky is clear, indicating good weather. The winding road adds a sense of depth and perspective to the image.

3. **Image 3:**
   - **Description:** A beach scene with waves crashing against the shore. There are several people in the water and on the beach, enjoying the waves and the sunset.
   - **Details:** The waves are powerful, creating a dynamic and lively atmosphere. The sky is painted with hues of orange and pink from the setting sun, adding a warm glow to the scene.

4. **Image 4:**
   - **Description:** A garden path leading to a large tree with a bench underneath it. The path is bordered by well-maintained grass and flowers.
   - **Details:** The path is made of small stones or gravel, and the tree provides a shaded area with the bench invitingly placed beneath it. The surrounding area is lush and green, suggesting a well-kept garden.

Each image captures a different scene, from a close-up of a dog to expansive natural landscapes, showcasing various elements of nature and human interaction with it.
"""
    # fmt: on
    # Both values are strings: `assertListEqual` would reject them before comparing anything.
    self.assertEqual(output, EXPECTED_GENERATION)
import random
import unittest

import numpy as np

from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_vision_available

from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs


if is_torch_available():
    import torch

if is_vision_available():
    from PIL import Image

    from transformers import PixtralImageProcessor


class PixtralImageProcessingTester(unittest.TestCase):
    """Holds the image-processor settings used by the tests and builds matching random inputs."""

    def __init__(
        self,
        parent,
        batch_size=7,
        num_channels=3,
        image_size=18,
        max_num_images_per_sample=3,
        min_resolution=30,
        max_resolution=400,
        do_resize=True,
        size=None,
        patch_size=None,
        do_normalize=True,
        image_mean=None,
        image_std=None,
        do_convert_rgb=True,
    ):
        # Container defaults are created per instance so that no two testers share a mutable object.
        size = size if size is not None else {"longest_edge": 24}
        patch_size = patch_size if patch_size is not None else {"height": 8, "width": 8}
        image_mean = image_mean if image_mean is not None else [0.48145466, 0.4578275, 0.40821073]
        image_std = image_std if image_std is not None else [0.26862954, 0.26130258, 0.27577711]
        self.parent = parent
        self.batch_size = batch_size
        self.num_channels = num_channels
        self.image_size = image_size
        self.max_num_images_per_sample = max_num_images_per_sample
        self.min_resolution = min_resolution
        self.max_resolution = max_resolution
        self.do_resize = do_resize
        self.size = size
        self.patch_size = patch_size
        self.do_normalize = do_normalize
        self.image_mean = image_mean
        self.image_std = image_std
        self.do_convert_rgb = do_convert_rgb

    def prepare_image_processor_dict(self):
        """Return the keyword arguments used to build the image processor under test."""
        return {
            "do_resize": self.do_resize,
            "size": self.size,
            "patch_size": self.patch_size,
            "do_normalize": self.do_normalize,
            "image_mean": self.image_mean,
            "image_std": self.image_std,
            "do_convert_rgb": self.do_convert_rgb,
        }

    def expected_output_image_shape(self, image):
        """
        Compute the `(num_channels, height, width)` shape expected for `image` after processing.

        Raises:
            TypeError: If `image` is not a PIL image, a NumPy array or a PyTorch tensor.
        """
        if isinstance(image, Image.Image):
            width, height = image.size
        elif isinstance(image, np.ndarray):
            height, width = image.shape[:2]
        elif isinstance(image, torch.Tensor):
            height, width = image.shape[-2:]
        else:
            raise TypeError(f"Unsupported image type: {type(image)}")

        max_height = max_width = self.size.get("longest_edge")

        # Shrink (never enlarge) so that the longest edge fits, keeping the aspect ratio.
        ratio = max(height / max_height, width / max_width)
        if ratio > 1:
            height = int(np.ceil(height / ratio))
            width = int(np.ceil(width / ratio))

        # Round both sides up to a whole number of patches.
        patch_height, patch_width = self.patch_size["height"], self.patch_size["width"]
        num_height_tokens = (height - 1) // patch_height + 1
        num_width_tokens = (width - 1) // patch_width + 1

        height = num_height_tokens * patch_height
        width = num_width_tokens * patch_width

        return self.num_channels, height, width

    def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
        """Return `batch_size` samples, each a list of 1 to `max_num_images_per_sample` random images."""
        # Use prepare_image_inputs to make a list of list of single images
        images_list = []
        for _ in range(self.batch_size):
            images = []
            for _ in range(random.randint(1, self.max_num_images_per_sample)):
                img = prepare_image_inputs(
                    batch_size=1,
                    num_channels=self.num_channels,
                    min_resolution=self.min_resolution,
                    max_resolution=self.max_resolution,
                    equal_resolution=equal_resolution,
                    numpify=numpify,
                    torchify=torchify,
                )[0]
                images.append(img)
            images_list.append(images)
        return images_list


@require_torch
@require_vision
class PixtralImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
    image_processing_class = PixtralImageProcessor if is_vision_available() else None

    def setUp(self):
        super().setUp()
        self.image_processor_tester = PixtralImageProcessingTester(self)

    @property
    def image_processor_dict(self):
        return self.image_processor_tester.prepare_image_processor_dict()

    def test_image_processor_properties(self):
        image_processing = self.image_processing_class(**self.image_processor_dict)
        self.assertTrue(hasattr(image_processing, "do_resize"))
        self.assertTrue(hasattr(image_processing, "size"))
        self.assertTrue(hasattr(image_processing, "patch_size"))
        self.assertTrue(hasattr(image_processing, "do_rescale"))
        self.assertTrue(hasattr(image_processing, "rescale_factor"))
        self.assertTrue(hasattr(image_processing, "do_normalize"))
        self.assertTrue(hasattr(image_processing, "image_mean"))
        self.assertTrue(hasattr(image_processing, "image_std"))
        self.assertTrue(hasattr(image_processing, "do_convert_rgb"))

    def _check_call(self, expected_type, **prepare_kwargs):
        """Process inputs of `expected_type`, one image and then the nested batch, and check every output shape."""
        # Initialize image_processing
        image_processing = self.image_processing_class(**self.image_processor_dict)
        tester = self.image_processor_tester
        image_inputs_list = tester.prepare_image_inputs(**prepare_kwargs)
        for image_inputs in image_inputs_list:
            for image in image_inputs:
                self.assertIsInstance(image, expected_type)

        # Test not batched input
        first_image = image_inputs_list[0][0]
        encoded_images = image_processing(first_image, return_tensors="pt").pixel_values
        self.assertEqual(tuple(encoded_images[0][0].shape), tester.expected_output_image_shape(first_image))

        # Test batched
        batch_encoded_images = image_processing(image_inputs_list, return_tensors="pt").pixel_values
        for encoded_images, images in zip(batch_encoded_images, image_inputs_list):
            for encoded_image, image in zip(encoded_images, images):
                self.assertEqual(tuple(encoded_image.shape), tester.expected_output_image_shape(image))

    def test_call_pil(self):
        # random PIL images
        self._check_call(Image.Image)

    def test_call_numpy(self):
        # random numpy tensors
        self._check_call(np.ndarray, numpify=True)

    def test_call_pytorch(self):
        # random PyTorch tensors
        self._check_call(torch.Tensor, torchify=True)

    @unittest.skip(reason="PixtralImageProcessor doesn't treat 4 channel PIL and numpy consistently yet")  # FIXME Amy
    def test_call_numpy_4_channels(self):
        pass
"""Testing suite for the PyTorch Pixtral model."""

import gc
import unittest

import requests

from transformers import (
    AutoProcessor,
    PixtralModel,
    PixtralVisionConfig,
    is_torch_available,
    is_vision_available,
)
from transformers.testing_utils import (
    require_bitsandbytes,
    require_torch,
    slow,
    torch_device,
)

from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor


if is_torch_available():
    import torch
else:
    is_torch_greater_or_equal_than_2_0 = False

if is_vision_available():
    from PIL import Image


# Skip reasons shared by several of the disabled common tests below.
_NO_INPUT_EMBEDS = "model does not support input embeds"
_GC_UNSUPPORTED = "This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
_NO_COMPILE = "Compile not yet supported because in Pixtral models"
_NOT_SUPPORTED = "Not supported yet"


class PixtralModelTester:
    """Builds a small `PixtralVisionConfig` and random pixel inputs for the common model tests."""

    def __init__(
        self,
        parent,
        batch_size=12,
        image_size=30,
        patch_size=2,
        num_channels=3,
        is_training=True,
        hidden_size=32,
        projection_dim=32,
        num_hidden_layers=2,
        num_attention_heads=4,
        intermediate_size=37,
        dropout=0.1,
        attention_dropout=0.1,
        initializer_range=0.02,
        scope=None,
    ):
        self.parent = parent
        self.scope = scope
        self.is_training = is_training
        # input geometry
        self.batch_size = batch_size
        self.num_channels = num_channels
        self.image_size = image_size
        self.patch_size = patch_size
        # model dimensions
        self.hidden_size = hidden_size
        self.projection_dim = projection_dim
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        # regularisation and initialisation
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.initializer_range = initializer_range

        # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token)
        patches_per_side = image_size // patch_size
        self.seq_length = patches_per_side**2 + 1

    def prepare_config_and_inputs(self):
        """Return the test config together with a random `pixel_values` batch."""
        shape = [self.batch_size, self.num_channels, self.image_size, self.image_size]
        pixel_values = floats_tensor(shape)
        return self.get_config(), pixel_values

    def get_config(self):
        """Return a `PixtralVisionConfig` built from the tester's attributes."""
        config_kwargs = {
            "image_size": self.image_size,
            "patch_size": self.patch_size,
            "num_channels": self.num_channels,
            "hidden_size": self.hidden_size,
            "projection_dim": self.projection_dim,
            "num_hidden_layers": self.num_hidden_layers,
            "num_attention_heads": self.num_attention_heads,
            "intermediate_size": self.intermediate_size,
            "dropout": self.dropout,
            "attention_dropout": self.attention_dropout,
            "initializer_range": self.initializer_range,
        }
        return PixtralVisionConfig(**config_kwargs)

    def create_and_check_model(self, config, pixel_values):
        """Run a forward pass and check the shapes of `last_hidden_state` and `pooler_output`."""
        model = PixtralModel(config=config)
        model.to(torch_device)
        model.eval()
        with torch.no_grad():
            result = model(pixel_values)
        # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token)
        patches_per_side = self.image_size // self.patch_size
        expected_seq_length = patches_per_side * patches_per_side + 1
        self.parent.assertEqual(
            result.last_hidden_state.shape, (self.batch_size, expected_seq_length, self.hidden_size)
        )
        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))

    def create_and_check_model_with_projection(self, config, pixel_values):
        """Run a forward pass and check the shapes of `last_hidden_state` and `image_embeds`."""
        model = PixtralModel(config=config)
        model.to(torch_device)
        model.eval()
        with torch.no_grad():
            result = model(pixel_values)
        # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token)
        patches_per_side = self.image_size // self.patch_size
        expected_seq_length = patches_per_side * patches_per_side + 1
        self.parent.assertEqual(
            result.last_hidden_state.shape, (self.batch_size, expected_seq_length, self.hidden_size)
        )
        self.parent.assertEqual(result.image_embeds.shape, (self.batch_size, self.projection_dim))

    def prepare_config_and_inputs_for_common(self):
        """Return the config and the inputs packed as the keyword dict used by the common tests."""
        config, pixel_values = self.prepare_config_and_inputs()
        return config, {"pixel_values": pixel_values}


@require_torch
class PixtralModelModelTest(ModelTesterMixin, unittest.TestCase):
    """
    Model tester for `PixtralModel`.
    """

    all_model_classes = (PixtralModel,) if is_torch_available() else ()
    test_pruning = False
    test_head_masking = False

    def setUp(self):
        self.model_tester = PixtralModelTester(self)
        self.config_tester = ConfigTester(self, config_class=PixtralVisionConfig, has_text_modality=False)

    # --- common tests disabled for this model -------------------------------------------------

    @unittest.skip(_NO_INPUT_EMBEDS)
    def test_inputs_embeds(self):
        pass

    @unittest.skip(_NO_INPUT_EMBEDS)
    def test_inputs_embeds_matches_input_ids(self):
        pass

    @unittest.skip(reason=_GC_UNSUPPORTED)
    def test_training_gradient_checkpointing(self):
        pass

    @unittest.skip(reason=_GC_UNSUPPORTED)
    def test_training_gradient_checkpointing_use_reentrant(self):
        pass

    @unittest.skip(reason=_GC_UNSUPPORTED)
    def test_training_gradient_checkpointing_use_reentrant_false(self):
        pass

    @unittest.skip(reason=_NO_COMPILE)
    def test_sdpa_can_compile_dynamic(self):
        pass

    @unittest.skip(reason=_NO_COMPILE)
    def test_sdpa_can_dispatch_on_flash(self):
        pass

    @unittest.skip(reason=_NOT_SUPPORTED)
    def test_attention_outputs(self):
        pass

    @unittest.skip(reason=_NOT_SUPPORTED)
    def test_cpu_offload(self):
        pass

    @unittest.skip(reason=_NOT_SUPPORTED)
    def test_batching_equivalence(self):
        pass

    @unittest.skip(reason=_NOT_SUPPORTED)
    def test_disk_offload_bin(self):
        pass

    @unittest.skip(reason=_NOT_SUPPORTED)
    def test_retain_grad_hidden_states_attentions(self):
        pass

    @unittest.skip(reason=_NOT_SUPPORTED)
    def test_multi_gpu_data_parallel_forward(self):
        pass

    @unittest.skip(reason=_NOT_SUPPORTED)
    def test_model_parallelism(self):
        pass

    @unittest.skip(reason=_NOT_SUPPORTED)
    def test_model_outputs_equivalence(self):
        pass

    @unittest.skip(reason=_NOT_SUPPORTED)
    def test_save_load(self):
        pass

    @unittest.skip(reason=_NOT_SUPPORTED)
    def test_model_get_set_embeddings(self):
        pass

    @unittest.skip(reason=_NOT_SUPPORTED)
    def test_resize_tokens_embeddings(self):
        pass

    @unittest.skip(reason=_NOT_SUPPORTED)
    def test_model_main_input_name(self):
        pass

    @unittest.skip(reason=_NOT_SUPPORTED)
    def test_initialization(self):
        pass

    @unittest.skip(reason=_NOT_SUPPORTED)
    def test_hidden_states_output(self):
        pass

    @unittest.skip(reason=_NOT_SUPPORTED)
    def test_gradient_checkpointing_backward_compatibility(self):
        pass

    @unittest.skip(reason=_NOT_SUPPORTED)
    def test_feed_forward_chunking(self):
        pass

    @unittest.skip(reason=_NOT_SUPPORTED)
    def test_disk_offload_safetensors(self):
        pass

    @unittest.skip(reason=_NOT_SUPPORTED)
    def test_determinism(self):
        pass


@require_torch
class PixtralModelIntegrationTest(unittest.TestCase):
    """Slow end-to-end check against the `hf-internal-testing/pixtral-12b` checkpoint."""

    def setUp(self):
        self.processor = AutoProcessor.from_pretrained("hf-internal-testing/pixtral-12b")

    def tearDown(self):
        gc.collect()
        torch.cuda.empty_cache()

    @slow
    @require_bitsandbytes
    def test_small_model_integration_test(self):
        # Let's make sure we test the preprocessing to replace what is used
        checkpoint = "hf-internal-testing/pixtral-12b"
        model = PixtralModel.from_pretrained(checkpoint, load_in_4bit=True)

        prompt = "[INST][IMG]\nWhat are the things I should be cautious about when I visit this place?[/INST]"
        image_file = "https://pixtral-vl.github.io/static/images/view.jpg"
        response = requests.get(image_file, stream=True)
        raw_image = Image.open(response.raw)
        inputs = self.processor(prompt, raw_image, return_tensors="pt")

        EXPECTED_INPUT_IDS = torch.tensor([[1, 32000, 28705, 13, 11123, 28747, 1824, 460, 272, 1722,315, 1023, 347, 13831, 925, 684, 739, 315, 3251, 456,1633, 28804, 13, 4816, 8048, 12738, 28747]])  # fmt: skip
        self.assertTrue(torch.equal(inputs["input_ids"], EXPECTED_INPUT_IDS))

        output = model.generate(**inputs, max_new_tokens=20)
        EXPECTED_DECODED_TEXT = "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT: When visiting this place, there are a few things one should be cautious about. Firstly,"  # fmt: skip

        decoded_text = self.processor.decode(output[0], skip_special_tokens=True)
        self.assertEqual(decoded_text, EXPECTED_DECODED_TEXT)
+import unittest + +import requests +import torch + +from transformers.testing_utils import require_vision +from transformers.utils import is_vision_available + + +if is_vision_available(): + from PIL import Image + + from transformers import AutoTokenizer, PixtralImageProcessor, PixtralProcessor + + +@require_vision +class PixtralProcessorTest(unittest.TestCase): + processor_class = PixtralProcessor + + @classmethod + def setUpClass(cls): + cls.url_0 = "https://www.ilankelman.org/stopsigns/australia.jpg" + cls.image_0 = Image.open(requests.get(cls.url_0, stream=True).raw) + cls.url_1 = "http://images.cocodataset.org/val2017/000000039769.jpg" + cls.image_1 = Image.open(requests.get(cls.url_1, stream=True).raw) + cls.url_2 = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg" + cls.image_2 = Image.open(requests.get(cls.url_2, stream=True).raw) + + def setUp(self): + super().setUp() + + # FIXME - just load the processor directly from the checkpoint + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/pixtral-12b") + image_processor = PixtralImageProcessor() + self.processor = PixtralProcessor(tokenizer=tokenizer, image_processor=image_processor) + + @unittest.skip("No chat template was set for this model (yet)") + def test_chat_template(self): + expected_prompt = "USER: [IMG]\nWhat is shown in this image? 
ASSISTANT:" + + messages = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": "What is shown in this image?"}, + ], + }, + ] + formatted_prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True) + self.assertEqual(expected_prompt, formatted_prompt) + + @unittest.skip("No chat template was set for this model (yet)") + def test_image_token_filling(self): + # Important to check with non square image + image = torch.randint(0, 2, (3, 500, 316)) + expected_image_tokens = 1526 + image_token_index = 32000 + + messages = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": "What is shown in this image?"}, + ], + }, + ] + inputs = self.processor( + text=[self.processor.apply_chat_template(messages)], + images=[image], + return_tensors="pt", + ) + image_tokens = (inputs["input_ids"] == image_token_index).sum().item() + self.assertEqual(expected_image_tokens, image_tokens) + + def test_processor_with_single_image(self): + prompt_string = "USER: [IMG]\nWhat's the content of the image? 
ASSISTANT:" + + # Make small for checking image token expansion + self.processor.image_processor.size = {"longest_edge": 30} + self.processor.image_processor.patch_size = {"height": 2, "width": 2} + + # Test passing in an image + inputs_image = self.processor(text=prompt_string, images=self.image_0, return_tensors="pt") + self.assertIn("input_ids", inputs_image) + self.assertTrue(len(inputs_image["input_ids"]) == 1) + self.assertIsInstance(inputs_image["input_ids"], torch.Tensor) + self.assertIsInstance(inputs_image["pixel_values"], list) + self.assertTrue(len(inputs_image["pixel_values"]) == 1) + self.assertIsInstance(inputs_image["pixel_values"][0], list) + self.assertTrue(len(inputs_image["pixel_values"][0]) == 1) + self.assertIsInstance(inputs_image["pixel_values"][0][0], torch.Tensor) + + # fmt: off + input_ids = inputs_image["input_ids"] + self.assertEqual( + input_ids[0].tolist(), + # Equivalent to "USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the content of the image? ASSISTANT:" + [21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 4701, 1307, 1278, 3937, 1063, 1349, 4290, 16002, 41150, 1058] + ) + # fmt: on + + # Test passing in a url + inputs_url = self.processor(text=prompt_string, images=self.url_0, return_tensors="pt") + self.assertIn("input_ids", inputs_url) + self.assertTrue(len(inputs_url["input_ids"]) == 1) + self.assertIsInstance(inputs_url["input_ids"], torch.Tensor) + self.assertIsInstance(inputs_url["pixel_values"], list) + self.assertTrue(len(inputs_url["pixel_values"]) == 1) + self.assertIsInstance(inputs_url["pixel_values"][0], list) + self.assertTrue(len(inputs_url["pixel_values"][0]) == 1) + self.assertIsInstance(inputs_url["pixel_values"][0][0], torch.Tensor) + + # fmt: off + input_ids = inputs_url["input_ids"] + self.assertEqual( + input_ids[0].tolist(), + # Equivalent to "USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the content of the image? 
ASSISTANT:" + [21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 4701, 1307, 1278, 3937, 1063, 1349, 4290, 16002, 41150, 1058] + ) + # fmt: on + + def test_processor_with_multiple_images_single_list(self): + prompt_string = "USER: [IMG][IMG]\nWhat's the difference between these two images? ASSISTANT:" + + # Make small for checking image token expansion + self.processor.image_processor.size = {"longest_edge": 30} + self.processor.image_processor.patch_size = {"height": 2, "width": 2} + + # Test passing in an image + inputs_image = self.processor(text=prompt_string, images=[self.image_0, self.image_1], return_tensors="pt") + self.assertIn("input_ids", inputs_image) + self.assertTrue(len(inputs_image["input_ids"]) == 1) + self.assertIsInstance(inputs_image["input_ids"], torch.Tensor) + self.assertIsInstance(inputs_image["pixel_values"], list) + self.assertTrue(len(inputs_image["pixel_values"]) == 1) + self.assertIsInstance(inputs_image["pixel_values"][0], list) + self.assertTrue(len(inputs_image["pixel_values"][0]) == 2) + self.assertIsInstance(inputs_image["pixel_values"][0][0], torch.Tensor) + + # fmt: off + input_ids = inputs_image["input_ids"] + self.assertEqual( + input_ids[0].tolist(), + # Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? 
ASSISTANT:"] + [21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058] + ) + # fmt: on + + # Test passing in a url + inputs_url = self.processor(text=prompt_string, images=[self.url_0, self.url_1], return_tensors="pt") + self.assertIn("input_ids", inputs_url) + self.assertTrue(len(inputs_url["input_ids"]) == 1) + self.assertIsInstance(inputs_url["input_ids"], torch.Tensor) + self.assertIsInstance(inputs_url["pixel_values"], list) + self.assertTrue(len(inputs_url["pixel_values"]) == 1) + self.assertIsInstance(inputs_url["pixel_values"][0], list) + self.assertTrue(len(inputs_url["pixel_values"][0]) == 2) + self.assertIsInstance(inputs_url["pixel_values"][0][0], torch.Tensor) + # fmt: off + input_ids = inputs_url["input_ids"] + self.assertEqual( + input_ids[0].tolist(), + # Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"] + [21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058] + ) + # fmt: on + + def test_processor_with_multiple_images_multiple_lists(self): + prompt_string = [ + "USER: [IMG][IMG]\nWhat's the difference between these two images? ASSISTANT:", + "USER: [IMG]\nWhat's the content of the image? 
ASSISTANT:", + ] + self.processor.tokenizer.pad_token = "" + image_inputs = [[self.image_0, self.image_1], [self.image_2]] + + # Make small for checking image token expansion + self.processor.image_processor.size = {"longest_edge": 30} + self.processor.image_processor.patch_size = {"height": 2, "width": 2} + + # Test passing in an image + inputs_image = self.processor(text=prompt_string, images=image_inputs, return_tensors="pt", padding=True) + self.assertIn("input_ids", inputs_image) + self.assertTrue(len(inputs_image["input_ids"]) == 2) + self.assertIsInstance(inputs_image["input_ids"], torch.Tensor) + self.assertIsInstance(inputs_image["pixel_values"], list) + self.assertTrue(len(inputs_image["pixel_values"]) == 2) + self.assertIsInstance(inputs_image["pixel_values"][0], list) + self.assertTrue(len(inputs_image["pixel_values"][0]) == 2) + self.assertIsInstance(inputs_image["pixel_values"][0][0], torch.Tensor) + + # fmt: off + input_ids = inputs_image["input_ids"] + self.assertEqual( + input_ids[0].tolist(), + # Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? 
ASSISTANT:"] + [21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058] + ) + # fmt: on + + # Test passing in a url + inputs_url = self.processor(text=prompt_string, images=image_inputs, return_tensors="pt", padding=True) + self.assertIn("input_ids", inputs_url) + self.assertTrue(len(inputs_url["input_ids"]) == 2) + self.assertIsInstance(inputs_url["input_ids"], torch.Tensor) + self.assertIsInstance(inputs_url["pixel_values"], list) + self.assertTrue(len(inputs_url["pixel_values"]) == 2) + self.assertIsInstance(inputs_url["pixel_values"][0], list) + self.assertTrue(len(inputs_url["pixel_values"][0]) == 2) + self.assertIsInstance(inputs_url["pixel_values"][0][0], torch.Tensor) + + # fmt: off + input_ids = inputs_url["input_ids"] + self.assertEqual( + input_ids[0].tolist(), + # Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? 
ASSISTANT:"] + [21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058] + ) + # fmt: on From 95e816f2bca48de32167ce6243e6770dee23923d Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Mon, 16 Sep 2024 09:44:57 +0100 Subject: [PATCH 09/67] Cohere: update RoPE structure (#33408) --- .../models/cohere/configuration_cohere.py | 43 +++++ .../models/cohere/modeling_cohere.py | 170 ++++++++++++++---- src/transformers/models/dbrx/modeling_dbrx.py | 2 +- .../models/gemma/modeling_gemma.py | 2 +- .../models/granite/modeling_granite.py | 2 +- .../models/llama/configuration_llama.py | 2 +- .../models/llama/modeling_llama.py | 2 +- .../models/mistral/modeling_mistral.py | 2 +- .../models/mixtral/modeling_mixtral.py | 2 +- src/transformers/models/olmo/modeling_olmo.py | 2 +- .../models/olmoe/modeling_olmoe.py | 2 +- .../models/persimmon/modeling_persimmon.py | 2 +- src/transformers/models/phi/modeling_phi.py | 2 +- src/transformers/models/phi3/modeling_phi3.py | 2 +- .../models/qwen2/modeling_qwen2.py | 2 +- .../models/qwen2_moe/modeling_qwen2_moe.py | 2 +- .../models/stablelm/modeling_stablelm.py | 2 +- .../models/starcoder2/modeling_starcoder2.py | 2 +- 18 files changed, 190 insertions(+), 55 deletions(-) diff --git a/src/transformers/models/cohere/configuration_cohere.py b/src/transformers/models/cohere/configuration_cohere.py index 73973bfad60b93..3c1237e5113789 100644 --- a/src/transformers/models/cohere/configuration_cohere.py +++ b/src/transformers/models/cohere/configuration_cohere.py @@ -20,6 +20,7 @@ """Cohere model configuration""" from ...configuration_utils import PretrainedConfig +from ...modeling_rope_utils import rope_config_validation from ...utils import logging @@ -79,6 +80,43 @@ class CohereConfig(PretrainedConfig): Whether to tie weight embeddings rope_theta (`float`, *optional*, defaults to 10000.0): The base period of the RoPE embeddings. 
+ rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + `beta_fast` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (`List[float]`, *optional*): + Only used with 'longrope'. 
The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -121,6 +159,7 @@ def __init__( eos_token_id=255001, tie_word_embeddings=True, rope_theta=10000.0, + rope_scaling=None, attention_bias=False, attention_dropout=0.0, use_qk_norm=False, @@ -144,10 +183,14 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.use_cache = use_cache self.rope_theta = rope_theta + self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.use_qk_norm = use_qk_norm + # Validate the correctness of rotary position embeddings parameters + rope_config_validation(self) + super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py index 4010d9ec3a4327..ae84a9ec2d1a43 100644 --- a/src/transformers/models/cohere/modeling_cohere.py +++ b/src/transformers/models/cohere/modeling_cohere.py @@ -37,6 +37,7 @@ BaseModelOutputWithPast, CausalLMOutputWithPast, ) +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS from ...modeling_utils import PreTrainedModel from ...pytorch_utils import ALL_LAYERNORM_LAYERS from ...utils import ( @@ -135,35 +136,97 @@ def forward(self, hidden_states): class CohereRotaryEmbedding(nn.Module): - def __init__(self, dim, 
max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + # Note: the forward pass of this RoPE is slightly different from Llama's, resulting in different `sin`/`cos` for + # the same parameterization. The differences are highlighted with a comment. + + def __init__( + self, + dim=None, + max_position_embeddings=2048, + base=10000, + device=None, + scaling_factor=1.0, + rope_type="default", + config: Optional[CohereConfig] = None, + ): super().__init__() - self.scaling_factor = scaling_factor - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) + # TODO (joao): remove the `if` below, only used for BC + self.rope_kwargs = {} + if config is None: + logger.warning_once( + "`CohereRotaryEmbedding` can now be fully parameterized by passing the model config through the " + "`config` argument. All other arguments will be removed in v4.46" + ) + self.rope_kwargs = { + "rope_type": rope_type, + "factor": scaling_factor, + "dim": dim, + "base": base, + "max_position_embeddings": max_position_embeddings, + } + self.rope_type = rope_type + self.max_seq_len_cached = max_position_embeddings + self.original_max_seq_len = max_position_embeddings + else: + # BC: "rope_type" was originally "type" + if config.rope_scaling is not None: + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs) self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + def _dynamic_frequency_update(self, position_ids, 
device): + """ + dynamic RoPE layers should recompute `inv_freq` in the following situations: + 1 - growing beyond the cached sequence length (allow scaling) + 2 - the current sequence length is in the original scale (avoid losing precision with small sequences) + """ + seq_len = torch.max(position_ids) + 1 + if seq_len > self.max_seq_len_cached: # growth + inv_freq, self.attention_scaling = self.rope_init_fn( + self.config, device, seq_len=seq_len, **self.rope_kwargs + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation + self.max_seq_len_cached = seq_len + + if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset + self.register_buffer("inv_freq", self.original_inv_freq, persistent=False) + self.max_seq_len_cached = self.original_max_seq_len @torch.no_grad() def forward(self, x, position_ids): - # x: [bs, num_attention_heads, seq_len, head_size] + if "dynamic" in self.rope_type: + self._dynamic_frequency_update(position_ids, device=x.device) + + # Core RoPE block inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) position_ids_expanded = position_ids[:, None, :].float() - - # Force float32 since bfloat16 loses precision on long contexts - # See https://github.com/huggingface/transformers/pull/29285 + # Force float32 (see https://github.com/huggingface/transformers/pull/29285) device_type = x.device.type device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" with torch.autocast(device_type=device_type, enabled=False): freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.repeat_interleave(freqs, 2, dim=-1) + emb = torch.repeat_interleave(freqs, 2, dim=-1) # This line differs from Llama's implementation cos = emb.cos() sin = emb.sin() - return cos, sin + + # Advanced RoPE types (e.g. 
yarn) apply a post-processing scaling factor, equivalent to scaling attention + cos = cos * self.attention_scaling + sin = sin * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) def rotate_half(x): - # Split and rotate + # Split and rotate. Note that this function is different from e.g. Llama. x1 = x[..., ::2] x2 = x[..., 1::2] rot_x = torch.stack([-x2, x1], dim=-1).flatten(-2) @@ -272,17 +335,10 @@ def __init__(self, config: CohereConfig, layer_idx: Optional[int] = None): self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias) - self._init_rope() - # Ignore copy - def _init_rope(self): - self.rotary_emb = CohereRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) + # TODO (joao): remove in v4.46 (RoPE is computed in the model, not in the decoder layers) + self.rotary_emb = CohereRotaryEmbedding(config=self.config) - # Ignore copy def forward( self, hidden_states: torch.Tensor, @@ -292,6 +348,7 @@ def forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() @@ -310,7 +367,16 @@ def forward( key_states = key_states.transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - cos, sin = self.rotary_emb(value_states, position_ids) + if position_embeddings is None: + logger.warning_once( + "The attention layers in this model are transitioning from computing the RoPE 
embeddings internally " + "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be " + "removed and `position_embeddings` will be mandatory." + ) + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: @@ -350,8 +416,7 @@ def forward( return attn_output, attn_weights, past_key_value -# copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Cohere -# TODO(joao): add me back asap :) +# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Cohere class CohereFlashAttention2(CohereAttention): """ Cohere flash attention module. This module inherits from `CohereAttention` as the weights of the module stays @@ -377,6 +442,7 @@ def forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if isinstance(past_key_value, StaticCache): @@ -402,7 +468,16 @@ def forward( key_states = key_states.transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - cos, sin = self.rotary_emb(value_states, position_ids) + if position_embeddings is None: + logger.warning_once( + "The attention layers in this model are transitioning from computing the RoPE embeddings internally " + "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + "`position_embeddings` (Tuple of tensors, containing cos and sin). 
In v4.46 `position_ids` will be " + "removed and `position_embeddings` will be mandatory." + ) + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: @@ -418,7 +493,6 @@ def forward( dropout_rate = self.attention_dropout if self.training else 0.0 - # Ignore copy # In PEFT, usually we cast the layer norms in float32 for training stability reasons # therefore the input hidden states gets silently casted in float32. Hence, we need # cast them back in the correct dtype just to be sure everything works as expected. @@ -465,8 +539,6 @@ def forward( return attn_output, attn_weights, past_key_value -# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention Llama->Cohere -# TODO(joao): add me back asap :) class CohereSdpaAttention(CohereAttention): """ Cohere attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from @@ -474,7 +546,6 @@ class CohereSdpaAttention(CohereAttention): SDPA API. """ - # Ignore copy def forward( self, hidden_states: torch.Tensor, @@ -484,6 +555,7 @@ def forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if output_attentions: # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. 
@@ -517,7 +589,16 @@ def forward( key_states = key_states.transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - cos, sin = self.rotary_emb(value_states, position_ids) + if position_embeddings is None: + logger.warning_once( + "The attention layers in this model are transitioning from computing the RoPE embeddings internally " + "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be " + "removed and `position_embeddings` will be mandatory." + ) + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: @@ -587,6 +668,7 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: """ Args: @@ -601,6 +683,11 @@ def forward( If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`). past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence + position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*): + Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`, + with `head_dim` being the embedding dimension of each attention head. 
""" residual = hidden_states @@ -615,6 +702,7 @@ def forward( output_attentions=output_attentions, use_cache=use_cache, cache_position=cache_position, + position_embeddings=position_embeddings, ) # Fully Connected @@ -755,8 +843,7 @@ def _init_weights(self, module): "The bare Cohere Model outputting raw hidden-states without any specific head on top.", COHERE_START_DOCSTRING, ) -# copied from transformers.models.llama.modeling_llama.LlamaModel with Llama->Cohere -# TODO(joao): add me back asap :) +# Copied from transformers.models.llama.modeling_llama.LlamaModel with Llama->Cohere, LLAMA->COHERE class CohereModel(CoherePreTrainedModel): """ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`CohereDecoderLayer`] @@ -776,6 +863,7 @@ def __init__(self, config: CohereConfig): [CohereDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) self.norm = CohereLayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps) + self.rotary_emb = CohereRotaryEmbedding(config=config) self.gradient_checkpointing = False # Initialize weights and apply final processing @@ -787,14 +875,13 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.embed_tokens = value - # Ignore copy @add_start_docstrings_to_model_forward(COHERE_INPUTS_DOCSTRING) def forward( self, input_ids: torch.LongTensor = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, @@ -823,30 +910,33 @@ def forward( if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) - past_seen_tokens = 0 return_legacy_cache = False if ( use_cache and not isinstance(past_key_values, Cache) and not self.training ): # kept 
for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. " + "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" + ) if cache_position is None: past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 cache_position = torch.arange( past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device ) - if position_ids is None: position_ids = cache_position.unsqueeze(0) causal_mask = self._update_causal_mask( attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions ) - - # embed positions hidden_states = inputs_embeds + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + # decoder layers all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None @@ -866,6 +956,7 @@ def forward( output_attentions, use_cache, cache_position, + position_embeddings, ) else: layer_outputs = decoder_layer( @@ -876,6 +967,7 @@ def forward( output_attentions=output_attentions, use_cache=use_cache, cache_position=cache_position, + position_embeddings=position_embeddings, ) hidden_states = layer_outputs[0] diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 8db9f6e8b7d09f..43bac44ba1be20 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -1066,7 +1066,7 @@ def forward( return_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is 
deprecated and will be removed in v4.43. " + "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. " "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" ) diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index 085751cd9bc039..b14e0a4b3d8ca5 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -862,7 +862,7 @@ def forward( return_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. " + "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. " "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" ) diff --git a/src/transformers/models/granite/modeling_granite.py b/src/transformers/models/granite/modeling_granite.py index ff10b6e6d875f9..876f5ed2a7c8da 100644 --- a/src/transformers/models/granite/modeling_granite.py +++ b/src/transformers/models/granite/modeling_granite.py @@ -839,7 +839,7 @@ def forward( return_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. " + "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. 
" "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)" ) diff --git a/src/transformers/models/llama/configuration_llama.py b/src/transformers/models/llama/configuration_llama.py index 435f0091e06e70..a3667e06534564 100644 --- a/src/transformers/models/llama/configuration_llama.py +++ b/src/transformers/models/llama/configuration_llama.py @@ -192,7 +192,7 @@ def __init__( self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, move it to 'rope_type'. + # BC: if there is a 'type' field, copy it it to 'rope_type'. if self.rope_scaling is not None and "type" in self.rope_scaling: self.rope_scaling["rope_type"] = self.rope_scaling["type"] rope_config_validation(self) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 9a1d6c0749f932..c7017832b9324c 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -951,7 +951,7 @@ def forward( return_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. " + "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. 
" "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" ) diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index c43418182c3881..ffe16b27203301 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -767,7 +767,7 @@ def forward( past_key_values = DynamicCache.from_legacy_cache(past_key_values) return_legacy_cache = True logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. " + "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. " "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" ) diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index 2e23d06699087e..c7062e75b1085c 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -1023,7 +1023,7 @@ def forward( use_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. " + "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. 
" "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" ) diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py index 007e69570e7821..b4bda8e2db5251 100644 --- a/src/transformers/models/olmo/modeling_olmo.py +++ b/src/transformers/models/olmo/modeling_olmo.py @@ -873,7 +873,7 @@ def forward( return_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. " + "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. " "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" ) diff --git a/src/transformers/models/olmoe/modeling_olmoe.py b/src/transformers/models/olmoe/modeling_olmoe.py index a53f1eeda61196..a33338365312db 100644 --- a/src/transformers/models/olmoe/modeling_olmoe.py +++ b/src/transformers/models/olmoe/modeling_olmoe.py @@ -1012,7 +1012,7 @@ def forward( return_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. " + "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. 
" "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)" ) diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py index 9fab09bdcc7877..ccaa2c7fd29aae 100644 --- a/src/transformers/models/persimmon/modeling_persimmon.py +++ b/src/transformers/models/persimmon/modeling_persimmon.py @@ -690,7 +690,7 @@ def forward( use_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. " + "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. " "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" ) diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py index 0d8be04af20d5c..648d1653a3b503 100644 --- a/src/transformers/models/phi/modeling_phi.py +++ b/src/transformers/models/phi/modeling_phi.py @@ -981,7 +981,7 @@ def forward( use_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. " + "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. 
" "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" ) diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index 273b6a8f505e79..ec395679ae6207 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -1008,7 +1008,7 @@ def forward( use_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. " + "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. " "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" ) diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py index 030c74b034b794..d0ea8ef0e376e0 100644 --- a/src/transformers/models/qwen2/modeling_qwen2.py +++ b/src/transformers/models/qwen2/modeling_qwen2.py @@ -920,7 +920,7 @@ def forward( use_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. " + "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. 
" "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" ) diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index b196ed72a49b23..6f483e50cde065 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -1084,7 +1084,7 @@ def forward( use_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. " + "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. " "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" ) diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py index 27d0c856a61bd6..d91c0832ed33da 100755 --- a/src/transformers/models/stablelm/modeling_stablelm.py +++ b/src/transformers/models/stablelm/modeling_stablelm.py @@ -965,7 +965,7 @@ def forward( use_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. " + "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. 
" "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" ) diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py index c359c07c69c0b8..0be37c4e1fb91c 100644 --- a/src/transformers/models/starcoder2/modeling_starcoder2.py +++ b/src/transformers/models/starcoder2/modeling_starcoder2.py @@ -894,7 +894,7 @@ def forward( use_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. " + "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. " "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)" ) From 5ce0a113b5bc9dd8dbb92dd866772d79847d9a92 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Mon, 16 Sep 2024 11:07:59 +0200 Subject: [PATCH 10/67] Fix SSH workflow (#33451) * fix * update --------- Co-authored-by: ydshieh --- .github/workflows/ssh-runner.yml | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ssh-runner.yml b/.github/workflows/ssh-runner.yml index b433abb484fac4..db649876f60492 100644 --- a/.github/workflows/ssh-runner.yml +++ b/.github/workflows/ssh-runner.yml @@ -58,8 +58,19 @@ jobs: #because the SSH can be enabled dynamically if the workflow failed, so we need to store slack infos to be able to retrieve them during the waitforssh step shell: bash run: | - if [ "${{ secrets[format('{0}_{1}', github.actor, 'SLACK_ID')] }}" != "" ]; then - echo "SLACKCHANNEL=${{ secrets[format('{0}_{1}', github.actor, 'SLACK_ID')] }}" >> $GITHUB_ENV + echo "${{ github.actor }}" + github_actor=${{ github.actor }} + 
github_actor=${github_actor/'-'/'_'} + echo "$github_actor" + echo "github_actor=$github_actor" >> $GITHUB_ENV + + - name: Store Slack infos + #because the SSH can be enabled dynamically if the workflow failed, so we need to store slack infos to be able to retrieve them during the waitforssh step + shell: bash + run: | + echo "${{ env.github_actor }}" + if [ "${{ secrets[format('{0}_{1}', env.github_actor, 'SLACK_ID')] }}" != "" ]; then + echo "SLACKCHANNEL=${{ secrets[format('{0}_{1}', env.github_actor, 'SLACK_ID')] }}" >> $GITHUB_ENV else echo "SLACKCHANNEL=${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}" >> $GITHUB_ENV fi From ce62a41880b5b70a304d068eb58f55894a5a7af8 Mon Sep 17 00:00:00 2001 From: Merve Noyan Date: Mon, 16 Sep 2024 13:08:31 +0200 Subject: [PATCH 11/67] Add keypoint-detection task guide (#33274) --------- Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/_toctree.yml | 2 + docs/source/en/tasks/keypoint_detection.md | 154 +++++++++++++++++++++ 2 files changed, 156 insertions(+) create mode 100644 docs/source/en/tasks/keypoint_detection.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 235ea81a7f1ea6..7eff2a38302669 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -81,6 +81,8 @@ title: Image Feature Extraction - local: tasks/mask_generation title: Mask Generation + - local: tasks/keypoint_detection + title: Keypoint Detection - local: tasks/knowledge_distillation_for_image_classification title: Knowledge Distillation for Computer Vision title: Computer Vision diff --git a/docs/source/en/tasks/keypoint_detection.md b/docs/source/en/tasks/keypoint_detection.md new file mode 100644 index 00000000000000..a0ec71a5c22000 --- /dev/null +++ b/docs/source/en/tasks/keypoint_detection.md @@ -0,0 +1,154 @@ + + +# Keypoint Detection + +[[open-in-colab]] + +Keypoint detection identifies and locates specific points of interest within an image. 
These keypoints, also known as landmarks, represent meaningful features of objects, such as facial features or object parts. These models take an image input and return the following outputs: + +- **Keypoints and Scores**: Points of interest and their confidence scores. +- **Descriptors**: A representation of the image region surrounding each keypoint, capturing its texture, gradient, orientation and other properties. + +In this guide, we will show how to extract keypoints from images. + +For this tutorial, we will use [SuperPoint](./model_doc/superpoint.md), a foundation model for keypoint detection. + +```python +from transformers import AutoImageProcessor, SuperPointForKeypointDetection +processor = AutoImageProcessor.from_pretrained("magic-leap-community/superpoint") +model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/superpoint") +``` + +Let's test the model on the images below. + +
+ Bee + Cats +
+ + +```python +import torch +from PIL import Image +import requests +import cv2 + + +url_image_1 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg" +image_1 = Image.open(requests.get(url_image_1, stream=True).raw) +url_image_2 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png" +image_2 = Image.open(requests.get(url_image_2, stream=True).raw) + +images = [image_1, image_2] +``` + +We can now process our inputs and infer. + +```python +inputs = processor(images,return_tensors="pt").to(model.device, model.dtype) +outputs = model(**inputs) +``` + +The model output has relative keypoints, descriptors, masks and scores for each item in the batch. The mask highlights areas of the image where keypoints are present. + +```python +SuperPointKeypointDescriptionOutput(loss=None, keypoints=tensor([[[0.0437, 0.0167], + [0.0688, 0.0167], + [0.0172, 0.0188], + ..., + [0.5984, 0.9812], + [0.6953, 0.9812]]]), + scores=tensor([[0.0056, 0.0053, 0.0079, ..., 0.0125, 0.0539, 0.0377], + [0.0206, 0.0058, 0.0065, ..., 0.0000, 0.0000, 0.0000]], + grad_fn=), descriptors=tensor([[[-0.0807, 0.0114, -0.1210, ..., -0.1122, 0.0899, 0.0357], + [-0.0807, 0.0114, -0.1210, ..., -0.1122, 0.0899, 0.0357], + [-0.0807, 0.0114, -0.1210, ..., -0.1122, 0.0899, 0.0357], + ...], + grad_fn=), mask=tensor([[1, 1, 1, ..., 1, 1, 1], + [1, 1, 1, ..., 0, 0, 0]], dtype=torch.int32), hidden_states=None) +``` + +To plot actual keypoints in the image, we need to postprocess the output. To do so, we have to pass the actual image sizes to `post_process_keypoint_detection` along with outputs. + +```python +image_sizes = [(image.size[1], image.size[0]) for image in images] +outputs = processor.post_process_keypoint_detection(outputs, image_sizes) +``` + +The outputs are now a list of dictionaries where each dictionary is a processed output of keypoints, scores and descriptors. 
+ +```python +[{'keypoints': tensor([[ 226, 57], + [ 356, 57], + [ 89, 64], + ..., + [3604, 3391]], dtype=torch.int32), + 'scores': tensor([0.0056, 0.0053, ...], grad_fn=), + 'descriptors': tensor([[-0.0807, 0.0114, -0.1210, ..., -0.1122, 0.0899, 0.0357], + [-0.0807, 0.0114, -0.1210, ..., -0.1122, 0.0899, 0.0357]], + grad_fn=)}, + {'keypoints': tensor([[ 46, 6], + [ 78, 6], + [422, 6], + [206, 404]], dtype=torch.int32), + 'scores': tensor([0.0206, 0.0058, 0.0065, 0.0053, 0.0070, ...,grad_fn=), + 'descriptors': tensor([[-0.0525, 0.0726, 0.0270, ..., 0.0389, -0.0189, -0.0211], + [-0.0525, 0.0726, 0.0270, ..., 0.0389, -0.0189, -0.0211]}] +``` + +We can use these to plot the keypoints. + +```python +import matplotlib.pyplot as plt +import torch + +for i in range(len(images)): + keypoints = outputs[i]["keypoints"] + scores = outputs[i]["scores"] + descriptors = outputs[i]["descriptors"] + keypoints = outputs[i]["keypoints"].detach().numpy() + scores = outputs[i]["scores"].detach().numpy() + image = images[i] + image_width, image_height = image.size + + plt.axis('off') + plt.imshow(image) + plt.scatter( + keypoints[:, 0], + keypoints[:, 1], + s=scores * 100, + c='cyan', + alpha=0.4 + ) + plt.show() +``` + +Below you can see the outputs. + +
+ Bee + Cats +
+ From 2f62146f0e916c3e6752b59d34853be6df0506f2 Mon Sep 17 00:00:00 2001 From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com> Date: Mon, 16 Sep 2024 11:26:26 -0400 Subject: [PATCH 12/67] Uniformize kwargs for LLaVa processor and update docs (#32858) * Uniformize kwargs for LlaVa and update docs * Change order of processor inputs in docstring * Improve BC support for reversed images and text inputs * cleanup llava processor call docstring * Add encoded inputs as valid text inputs in reverse input check, add deprecation version in warning * Put function check reversed images text outside base processor class * Refactor _validate_images_text_input_order * Add ProcessingUtilTester * fix processing and test_processing --- .../models/llava/modeling_llava.py | 2 +- .../models/llava/processing_llava.py | 73 ++++++++++--------- tests/models/llava/test_modeling_llava.py | 20 ++--- tests/models/llava/test_processor_llava.py | 57 ++++++++++++++- 4 files changed, 104 insertions(+), 48 deletions(-) diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index 9ad19ccee72228..eb1c55341b0784 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -405,7 +405,7 @@ def forward( >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> inputs = processor(text=prompt, images=image, return_tensors="pt") + >>> inputs = processor(images=image, text=prompt, return_tensors="pt") >>> # Generate >>> generate_ids = model.generate(**inputs, max_new_tokens=15) diff --git a/src/transformers/models/llava/processing_llava.py b/src/transformers/models/llava/processing_llava.py index 678724ae95be41..28a9410e6cbf0b 100644 --- a/src/transformers/models/llava/processing_llava.py +++ b/src/transformers/models/llava/processing_llava.py @@ -16,18 +16,33 @@ Processor class for Llava. 
""" -from typing import List, Optional, Union +import sys +from typing import List, Union from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, get_image_size, to_numpy_array -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy -from ...utils import TensorType, logging +from ...processing_utils import ProcessingKwargs, ProcessorMixin, _validate_images_text_input_order +from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils import logging +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + logger = logging.get_logger(__name__) +class LlavaProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "padding": False, + }, + "images_kwargs": {}, + } + + class LlavaProcessor(ProcessorMixin): r""" Constructs a Llava processor which wraps a Llava image processor and a Llava tokenizer into a single processor. @@ -73,12 +88,11 @@ def __init__( def __call__( self, - text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, images: ImageInput = None, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = None, - max_length=None, - return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + audio=None, + videos=None, + **kwargs: Unpack[LlavaProcessorKwargs], ) -> BatchFeature: """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` @@ -88,29 +102,15 @@ def __call__( of the above two methods for more information. 
Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. text (`str`, `List[str]`, `List[List[str]]`): The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. Both channels-first and channels-last formats are supported. - padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): - Select a strategy to pad the returned sequences (according to the model's padding side and padding - index) among: - - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single - sequence if provided). - - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum - acceptable input length for the model if that argument is not provided. - - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different - lengths). - max_length (`int`, *optional*): - Maximum length of the returned list and optionally padding length (see above). - truncation (`bool`, *optional*): - Activates truncation to cut input sequences longer than `max_length` to `max_length`. return_tensors (`str` or [`~utils.TensorType`], *optional*): If set, will return tensors of a particular framework. Acceptable values are: - - `'tf'`: Return TensorFlow `tf.constant` objects. 
- `'pt'`: Return PyTorch `torch.Tensor` objects. - `'np'`: Return NumPy `np.ndarray` objects. @@ -125,8 +125,19 @@ def __call__( `None`). - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ + if images is None and text is None: + raise ValueError("You have to specify at least one of `images` or `text`.") + + # check if images and text inputs are reversed for BC + images, text = _validate_images_text_input_order(images, text) + + output_kwargs = self._merge_kwargs( + LlavaProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) if images is not None: - image_inputs = self.image_processor(images, return_tensors=return_tensors) + image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) else: image_inputs = {} @@ -158,13 +169,7 @@ def __call__( "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." ) - text_inputs = self.tokenizer( - prompt_strings, - return_tensors=return_tensors, - padding=padding, - truncation=truncation, - max_length=max_length, - ) + text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) return BatchFeature(data={**text_inputs, **image_inputs}) # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py index 5c05480ffa6dbb..305fc9e9a84cdb 100644 --- a/tests/models/llava/test_modeling_llava.py +++ b/tests/models/llava/test_modeling_llava.py @@ -274,7 +274,7 @@ def test_small_model_integration_test(self): prompt = "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT:" image_file = "https://llava-vl.github.io/static/images/view.jpg" raw_image = Image.open(requests.get(image_file, stream=True).raw) - inputs = self.processor(prompt, raw_image, return_tensors="pt") + inputs = self.processor(images=raw_image, 
text=prompt, return_tensors="pt") EXPECTED_INPUT_IDS = torch.tensor([[1, 32000, 28705, 13, 11123, 28747, 1824, 460, 272, 1722,315, 1023, 347, 13831, 925, 684, 739, 315, 3251, 456,1633, 28804, 13, 4816, 8048, 12738, 28747]]) # fmt: skip self.assertTrue(torch.equal(inputs["input_ids"], EXPECTED_INPUT_IDS)) @@ -299,7 +299,7 @@ def test_small_model_integration_test_llama_single(self): prompt = "USER: \nWhat are the things I should be cautious about when I visit this place? ASSISTANT:" image_file = "https://llava-vl.github.io/static/images/view.jpg" raw_image = Image.open(requests.get(image_file, stream=True).raw) - inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16) + inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16) output = model.generate(**inputs, max_new_tokens=900, do_sample=False) EXPECTED_DECODED_TEXT = "USER: \nWhat are the things I should be cautious about when I visit this place? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, there are a few things to be cautious about. First, be aware of the weather conditions, as sudden changes in weather can make the pier unsafe to walk on. Second, be mindful of the water depth and any potential hazards, such as submerged rocks or debris, that could cause accidents or injuries. Additionally, be cautious of the tides and currents, as they can change rapidly and pose a risk to swimmers or those who venture too close to the edge of the pier. Finally, be respectful of the environment and other visitors, and follow any posted rules or guidelines for the area." 
# fmt: skip @@ -325,7 +325,7 @@ def test_small_model_integration_test_llama_batched(self): image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw) image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) - inputs = processor(prompts, images=[image1, image2], return_tensors="pt", padding=True) + inputs = processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True) output = model.generate(**inputs, max_new_tokens=20) @@ -349,7 +349,7 @@ def test_small_model_integration_test_batch(self): image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw) image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) - inputs = self.processor(prompts, images=[image1, image2], return_tensors="pt", padding=True) + inputs = self.processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True) output = model.generate(**inputs, max_new_tokens=20) @@ -381,7 +381,7 @@ def test_small_model_integration_test_llama_batched_regression(self): image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw) image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) - inputs = processor(prompts, images=[image1, image2, image1], return_tensors="pt", padding=True) + inputs = processor(images=[image1, image2, image1], text=prompts, return_tensors="pt", padding=True) output = model.generate(**inputs, max_new_tokens=20) @@ -409,8 +409,8 @@ def test_batched_generation(self): image2 = Image.open(requests.get(url2, stream=True).raw) inputs = processor( - text=[prompt1, prompt2, prompt3], images=[image1, image2, image1, image2], + text=[prompt1, prompt2, prompt3], return_tensors="pt", padding=True, ).to(torch_device) @@ -444,7 +444,7 @@ def test_llava_index_error_bug(self): image_file 
= "http://images.cocodataset.org/val2017/000000039769.jpg" raw_image = Image.open(requests.get(image_file, stream=True).raw) - inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16) + inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16) # Make sure that `generate` works _ = model.generate(**inputs, max_new_tokens=20) @@ -510,7 +510,7 @@ def test_generation_no_images(self): processor = AutoProcessor.from_pretrained(model_id) # Prepare inputs with no images - inputs = processor("Hello, I am", return_tensors="pt").to(torch_device) + inputs = processor(text="Hello, I am", return_tensors="pt").to(torch_device) # Make sure that `generate` works _ = model.generate(**inputs, max_new_tokens=20) @@ -554,13 +554,13 @@ def test_expansion_in_processing(self): # check processing with expansion of inputs processor.vision_feature_select_strategy = "default" processor.patch_size = 14 - inputs_expanded = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16) + inputs_expanded = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16) self.assertTrue(inputs_expanded.input_ids.shape[-1] == 593) # check processing without expansion of inputs (legacy behavior) processor.vision_feature_select_strategy = None processor.patch_size = None - inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16) + inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16) self.assertTrue(inputs.input_ids.shape[-1] == 18) # generate exactly 20 tokens diff --git a/tests/models/llava/test_processor_llava.py b/tests/models/llava/test_processor_llava.py index 54c1b4674cbcef..5b05a8b92ea513 100644 --- a/tests/models/llava/test_processor_llava.py +++ b/tests/models/llava/test_processor_llava.py @@ -11,18 +11,43 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. +import shutil +import tempfile import unittest -from transformers.testing_utils import require_vision +from transformers import AutoProcessor, AutoTokenizer, LlamaTokenizerFast, LlavaProcessor +from transformers.testing_utils import require_torch, require_vision from transformers.utils import is_vision_available +from ...test_processing_common import ProcessorTesterMixin + if is_vision_available(): - from transformers import AutoTokenizer, LlavaProcessor + from transformers import CLIPImageProcessor @require_vision -class LlavaProcessorTest(unittest.TestCase): +class LlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = LlavaProcessor + + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + image_processor = CLIPImageProcessor(do_center_crop=False) + tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b") + + processor = LlavaProcessor(image_processor=image_processor, tokenizer=tokenizer) + + processor.save_pretrained(self.tmpdirname) + + def get_tokenizer(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer + + def get_image_processor(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor + + def tearDown(self): + shutil.rmtree(self.tmpdirname) + def test_can_load_various_tokenizers(self): for checkpoint in ["Intel/llava-gemma-2b", "llava-hf/llava-1.5-7b-hf"]: processor = LlavaProcessor.from_pretrained(checkpoint) @@ -45,3 +70,29 @@ def test_chat_template(self): formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True) self.assertEqual(expected_prompt, formatted_prompt) + + @require_torch + @require_vision + def test_unstructured_kwargs_batched(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + 
image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = ["lower newer", "upper older longer string"] + image_input = self.prepare_image_inputs() * 2 + inputs = processor( + images=image_input, + text=input_str, + return_tensors="pt", + size={"height": 214, "width": 214}, + padding="longest", + max_length=76, + ) + + self.assertEqual(inputs["pixel_values"].shape[2], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 5) From c7a91f5adf976e0517c4a7f1506fb0c24f353053 Mon Sep 17 00:00:00 2001 From: Sergio Paniego Blanco Date: Mon, 16 Sep 2024 18:52:27 +0200 Subject: [PATCH 13/67] `Agents, supercharged - Multi-agents, External tools, and more` docs typo fixed (#33478) * Typo fixed in Agents, supercharged --- docs/source/en/agents_advanced.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/agents_advanced.md b/docs/source/en/agents_advanced.md index e7469a310c4102..399eeb9b70eb20 100644 --- a/docs/source/en/agents_advanced.md +++ b/docs/source/en/agents_advanced.md @@ -34,7 +34,7 @@ You can easily build hierarchical multi-agent systems with `transformers.agents` To do so, encapsulate the agent in a [`ManagedAgent`] object. This object needs arguments `agent`, `name`, and a `description`, which will then be embedded in the manager agent's system prompt to let it know how to call this managed agent, as we also do for tools. 
-Here's an example of making an agent that managed a specitif web search agent using our [`DuckDuckGoSearchTool`]: +Here's an example of making an agent that managed a specific web search agent using our [`DuckDuckGoSearchTool`]: ```py from transformers.agents import ReactCodeAgent, HfApiEngine, DuckDuckGoSearchTool, ManagedAgent From c2d05897bf4e8b34773838accaddd66028bc148d Mon Sep 17 00:00:00 2001 From: Ahmed Almaghz <53489256+AhmedAlmaghz@users.noreply.github.com> Date: Mon, 16 Sep 2024 20:02:03 +0300 Subject: [PATCH 14/67] [i18n-ar] Add File : `docs/source/ar/_toctree.yml` (#32696) * Update ar lang build_documentation.yml * Update ar lang build_pr_documentation.yml * Update docs/source/ar/pipeline_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/pipeline_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/pipeline_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/pipeline_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/pipeline_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/pipeline_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/pipeline_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/pipeline_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/pipeline_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/pipeline_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/pipeline_tutorial.md Co-authored-by: Abdullah Mohammed 
<554032+abodacs@users.noreply.github.com> * Update docs/source/ar/pipeline_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/pipeline_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/pipeline_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/pipeline_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/pipeline_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/pipeline_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/pipeline_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/pipeline_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/pipeline_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/pipeline_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/pipeline_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/pipeline_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/pipeline_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/pipeline_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/autoclass_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/autoclass_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update 
docs/source/ar/autoclass_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/autoclass_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/autoclass_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/autoclass_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/autoclass_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/autoclass_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/autoclass_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/autoclass_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/autoclass_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/autoclass_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/autoclass_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/autoclass_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed 
<554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed 
<554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed 
<554032+abodacs@users.noreply.github.com> * Update docs/source/ar/preprocessing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/training.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/training.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/training.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/training.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/training.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/training.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/training.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/training.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/training.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/training.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/training.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/training.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/training.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/training.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/training.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/training.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/training.md 
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/training.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/training.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/training.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/run_scripts.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/run_scripts.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/run_scripts.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/run_scripts.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/run_scripts.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/run_scripts.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/run_scripts.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/accelerate.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/accelerate.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/accelerate.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/accelerate.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/accelerate.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/accelerate.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Create _config.py * Update _toctree.yml * Update _toctree.yml * Update docs/source/ar/peft.md 
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/peft.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/peft.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/peft.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/peft.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/peft.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/peft.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/peft.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/peft.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update _toctree.yml * Update docs/source/ar/model_sharing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/model_sharing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/model_sharing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/model_sharing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/model_sharing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/model_sharing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/model_sharing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/model_sharing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/model_sharing.md Co-authored-by: Abdullah Mohammed 
<554032+abodacs@users.noreply.github.com> * Update docs/source/ar/model_sharing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/model_sharing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/model_sharing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/model_sharing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/model_sharing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/model_sharing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/model_sharing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/model_sharing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/model_sharing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/model_sharing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/model_sharing.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/conversations.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/conversations.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/conversations.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/conversations.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/conversations.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/conversations.md Co-authored-by: Abdullah Mohammed 
<554032+abodacs@users.noreply.github.com> * Update docs/source/ar/conversations.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/conversations.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/conversations.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/conversations.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/conversations.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/conversations.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/conversations.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/conversations.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/conversations.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update 
docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update 
docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/agents.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/llm_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/llm_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/llm_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/llm_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/llm_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/llm_tutorial.md Co-authored-by: Abdullah Mohammed 
<554032+abodacs@users.noreply.github.com> * Update docs/source/ar/llm_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/llm_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/llm_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/llm_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/llm_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/llm_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/llm_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/llm_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/llm_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/llm_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/llm_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/llm_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/llm_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/llm_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/llm_tutorial.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update llm_tutorial.md * Update _toctree.yml * Update autoclass_tutorial.md * Update autoclass_tutorial.md * Update preprocessing.md * Update glossary.md * Update run_scripts.md * Update run_scripts.md * Update 
run_scripts.md --------- Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> --- .github/workflows/build_documentation.yml | 2 +- .github/workflows/build_pr_documentation.yml | 2 +- docs/source/ar/_config.py | 14 + docs/source/ar/_toctree.yml | 892 +++++++++++++++++++ docs/source/ar/accelerate.md | 120 +++ docs/source/ar/agents.md | 539 +++++++++++ docs/source/ar/autoclass_tutorial.md | 167 ++++ docs/source/ar/conversations.md | 204 +++++ docs/source/ar/glossary.md | 446 ++++++++++ docs/source/ar/index.md | 342 +++++++ docs/source/ar/installation.md | 246 +++++ docs/source/ar/llm_tutorial.md | 248 ++++++ docs/source/ar/model_sharing.md | 223 +++++ docs/source/ar/peft.md | 250 ++++++ docs/source/ar/pipeline_tutorial.md | 315 +++++++ docs/source/ar/preprocessing.md | 521 +++++++++++ docs/source/ar/quicktour.md | 543 +++++++++++ docs/source/ar/run_scripts.md | 351 ++++++++ docs/source/ar/training.md | 412 +++++++++ 19 files changed, 5835 insertions(+), 2 deletions(-) create mode 100644 docs/source/ar/_config.py create mode 100644 docs/source/ar/_toctree.yml create mode 100644 docs/source/ar/accelerate.md create mode 100644 docs/source/ar/agents.md create mode 100644 docs/source/ar/autoclass_tutorial.md create mode 100644 docs/source/ar/conversations.md create mode 100644 docs/source/ar/glossary.md create mode 100644 docs/source/ar/index.md create mode 100644 docs/source/ar/installation.md create mode 100644 docs/source/ar/llm_tutorial.md create mode 100644 docs/source/ar/model_sharing.md create mode 100644 docs/source/ar/peft.md create mode 100644 docs/source/ar/pipeline_tutorial.md create mode 100644 docs/source/ar/preprocessing.md create mode 100644 docs/source/ar/quicktour.md create mode 100644 docs/source/ar/run_scripts.md create mode 100644 docs/source/ar/training.md diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index e3e3b5f2df37f1..b25567fb092a14 100644 --- 
a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -15,7 +15,7 @@ jobs: commit_sha: ${{ github.sha }} package: transformers notebook_folder: transformers_doc - languages: de en es fr hi it ko pt tr zh ja te + languages: ar de en es fr hi it ko pt tr zh ja te custom_container: huggingface/transformers-doc-builder secrets: token: ${{ secrets.HUGGINGFACE_PUSH }} diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index c8d073ea34688f..f698f860b2f93c 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -14,5 +14,5 @@ jobs: commit_sha: ${{ github.event.pull_request.head.sha }} pr_number: ${{ github.event.number }} package: transformers - languages: de en es fr hi it ko pt tr zh ja te + languages: ar de en es fr hi it ko pt tr zh ja te custom_container: huggingface/transformers-doc-builder diff --git a/docs/source/ar/_config.py b/docs/source/ar/_config.py new file mode 100644 index 00000000000000..f49e4e4731965a --- /dev/null +++ b/docs/source/ar/_config.py @@ -0,0 +1,14 @@ +# docstyle-ignore +INSTALL_CONTENT = """ +# Transformers installation +! pip install transformers datasets evaluate accelerate +# To install from source instead of the last release, comment the command above and uncomment the following one. +# ! 
pip install git+https://github.com/huggingface/transformers.git +""" + +notebook_first_cells = [{"type": "code", "content": INSTALL_CONTENT}] +black_avoid_patterns = { + "{processor_class}": "FakeProcessorClass", + "{model_class}": "FakeModelClass", + "{object_class}": "FakeObjectClass", +} diff --git a/docs/source/ar/_toctree.yml b/docs/source/ar/_toctree.yml new file mode 100644 index 00000000000000..39e0ae14e19c29 --- /dev/null +++ b/docs/source/ar/_toctree.yml @@ -0,0 +1,892 @@ +- sections: + - local: index + title: 🤗 المحولات + - local: quicktour + title: جولة سريعة + - local: installation + title: التثبيت + title: البدء +- sections: + - local: pipeline_tutorial + title: تشغيل الاستنتاج باستخدام خطوط الأنابيب + - local: autoclass_tutorial + title: كتابة تعليمات برمجية متكيفه باستخدام AutoClass + - local: preprocessing + title: معالجة البيانات مسبقًا + - local: training + title: ضبط نموذج مسبق التدريب + - local: run_scripts + title: التدريب باستخدام نص برمجي + - local: accelerate + title: إعداد تدريب موزع باستخدام 🤗 Accelerate + - local: peft + title: تحميل النماذج المخصصة وتدريبها باستخدام 🤗 PEFT + - local: model_sharing + title: مشاركة نموذجك + - local: agents + title: الوكلاء + - local: llm_tutorial + title: التوليد باستخدام LLMs + - local: conversations + title: الدردشة مع المحولات + title: البرامج التعليمية +# - sections: +# - isExpanded: false +# sections: +# - local: tasks/sequence_classification +# title: تصنيف النصوص +# - local: tasks/token_classification +# title: تصنيف الرموز +# - local: tasks/question_answering +# title: الإجابة على الأسئلة +# - local: tasks/language_modeling +# title: نمذجة اللغة السببية +# - local: tasks/masked_language_modeling +# title: نمذجة اللغة المقنعة +# - local: tasks/translation +# title: الترجمة +# - local: tasks/summarization +# title: التلخيص +# - local: tasks/multiple_choice +# title: الاختيار المتعدد +# title: معالجة اللغات الطبيعية +# - isExpanded: false +# sections: +# - local: tasks/audio_classification +# title: 
تصنيف الصوت +# - local: tasks/asr +# title: التعرف التلقائي على الكلام +# title: الصوت +# - isExpanded: false +# sections: +# - local: tasks/image_classification +# title: تصنيف الصور +# - local: tasks/semantic_segmentation +# title: تجزئة الصور +# - local: tasks/video_classification +# title: تصنيف الفيديو +# - local: tasks/object_detection +# title: اكتشاف الأشياء +# - local: tasks/zero_shot_object_detection +# title: اكتشاف الأشياء بدون تدريب +# - local: tasks/zero_shot_image_classification +# title: تصنيف الصور بدون تدريب +# - local: tasks/monocular_depth_estimation +# title: تقدير العمق +# - local: tasks/image_to_image +# title: صورة إلى صورة +# - local: tasks/image_feature_extraction +# title: استخراج ميزات الصورة +# - local: tasks/mask_generation +# title: توليد القناع +# - local: tasks/knowledge_distillation_for_image_classification +# title: التقليل المعرفي للرؤية الحاسوبية +# title: الرؤية الحاسوبية +# - isExpanded: false +# sections: +# - local: tasks/image_captioning +# title: وصف الصور Image captioning +# - local: tasks/document_question_answering +# title: الإجابة على أسئلة المستندات +# - local: tasks/visual_question_answering +# title: الإجابة على الأسئلة المرئية +# - local: tasks/text-to-speech +# title: تحويل النص إلى كلام +# title: المتعددة الوسائط +# - isExpanded: false +# sections: +# - local: generation_strategies +# title: تخصيص استراتيجية التوليد +# - local: kv_cache +# title: أفضل الممارسات للتوليد باستخدام ذاكرة التخزين المؤقت +# title: التوليد +# - isExpanded: false +# sections: +# - local: tasks/idefics +# title: مهام الصور مع IDEFICS +# - local: tasks/prompting +# title: دليل إرشادي لمحفزات النماذج اللغوية الكبيرة +# title: الإرشاد +# title: أدلة المهام +# - sections: +# - local: fast_tokenizers +# title: استخدم برامج التجزئة السريعة من 🤗 Tokenizers +# - local: multilingual +# title: تشغيل الاستنتاج باستخدام نماذج متعددة اللغات +# - local: create_a_model +# title: استخدام واجهات برمجة التطبيقات الخاصة بالنموذج +# - local: custom_models 
+# title: مشاركة نموذج مخصص +# - local: chat_templating +# title: قوالب لنماذج الدردشة +# - local: trainer +# title: المدرب +# - local: sagemaker +# title: تشغيل التدريب على Amazon SageMaker +# - local: serialization +# title: التصدير إلى ONNX +# - local: tflite +# title: التصدير إلى TFLite +# - local: torchscript +# title: التصدير إلى TorchScript +# - local: benchmarks +# title: المعايير +# - local: notebooks +# title: دفاتر الملاحظات مع الأمثلة +# - local: community +# title: موارد المجتمع +# - local: troubleshooting +# title: استكشاف الأخطاء وإصلاحها +# - local: gguf +# title: التوافق مع ملفات GGUF +# title: أدلة المطورين +# - sections: +# - local: quantization/overview +# title: نظرة عامة +# - local: quantization/bitsandbytes +# title: bitsandbytes +# - local: quantization/gptq +# title: GPTQ +# - local: quantization/awq +# title: AWQ +# - local: quantization/aqlm +# title: AQLM +# - local: quantization/quanto +# title: Quanto +# - local: quantization/eetq +# title: EETQ +# - local: quantization/hqq +# title: HQQ +# - local: quantization/optimum +# title: Optimum +# - local: quantization/contribute +# title: المساهمة بطريقة جديدة للتكميم +# title: أساليب التكميم +# - sections: +# - local: performance +# title: الأداء-نظرة عامة +# - local: llm_optims +# title: تحسين الاستدلال LLM +# - sections: +# - local: perf_train_gpu_one +# title: استخدام عدة وحدات معالجة رسوميات (GPUs) بشكل متوازٍ +# - local: perf_train_gpu_many +# title: وحدات معالجة الرسومات (GPU) متعددة والتوازي +# - local: fsdp +# title: Fully Sharded Data Parallel +# - local: deepspeed +# title: DeepSpeed +# - local: perf_train_cpu +# title: التدريب الفعال على وحدة المعالجة المركزية (CPU) +# - local: perf_train_cpu_many +# title: التدريب الموزع لوحدة المعالجة المركزية (CPU) +# - local: perf_train_tpu_tf +# title: التدريب على (TPU) باستخدام TensorFlow +# - local: perf_train_special +# title: تدريب PyTorch على Apple silicon +# - local: perf_hardware +# title: الأجهزة المخصصة للتدريب +# - local: hpo_train 
+# title: البحث عن المعاملات المثلى باستخدام واجهة برمجة تطبيقات المدرب +# title: تقنيات التدريب الفعال +# - sections: +# - local: perf_infer_cpu +# title: الإستدلال على وحدة المعالجة المركزية (CPU) +# - local: perf_infer_gpu_one +# title: الإستدلال على وحدة معالجة الرسومات (GPU) +# title: تحسين الاستدلال +# - local: big_models +# title: إنشاء نموذج كبير +# - local: debugging +# title: تصحيح الأخطاء البرمجية +# - local: tf_xla +# title: تكامل XLA لنماذج TensorFlow +# - local: perf_torch_compile +# title: تحسين الاستدلال باستخدام `torch.compile()` +# title: الأداء وقابلية التوسع +# - sections: +# - local: contributing +# title: كيفية المساهمة في 🤗 المحولات؟ +# - local: add_new_model +# title: كيفية إضافة نموذج إلى 🤗 المحولات؟ +# - local: add_new_pipeline +# title: كيفية إضافة خط أنابيب إلى 🤗 المحولات؟ +# - local: testing +# title: الاختبار +# - local: pr_checks +# title: التحقق من طلب السحب +# title: المساهمة +- sections: + # - local: philosophy + # title: الفلسفة + - local: glossary + title: قاموس المصطلحات (قائمة الكلمات) + # - local: task_summary + # title: ما الذي يمكن أن تفعله 🤗 المحولات + # - local: tasks_explained + # title: كيف تحل المحولات المهام + # - local: model_summary + # title: عائلة نماذج المحول + # - local: tokenizer_summary + # title: ملخص برنامج مقسم النصوص (tokenizers) + # - local: attention + # title: الانتباه Attention + # - local: pad_truncation + # title: الحشو والتقليم + # - local: bertology + # title: BERTology + # - local: perplexity + # title: حيرة النماذج ذات الطول الثابت + # - local: pipeline_webserver + # title: خطوط الأنابيب للاستدلال على خادم الويب + # - local: model_memory_anatomy + # title: تشريح تدريب النموذج + # - local: llm_tutorial_optimization + # title: الاستفادة القصوى من LLMs + title: أطر مفاهيمية +# - sections: +# - sections: +# - local: main_classes/agent +# title: الوكلاء والأدوات +# - local: model_doc/auto +# title: فئات يتم إنشاؤها ديناميكيًا +# - local: main_classes/backbones +# title: العمود الفقري +# - local: 
main_classes/callback +# title: عمليات الاسترجاع +# - local: main_classes/configuration +# title: التكوين +# - local: main_classes/data_collator +# title: مجمع البيانات +# - local: main_classes/keras_callbacks +# title: استدعاءات Keras +# - local: main_classes/logging +# title: التسجيل +# - local: main_classes/model +# title: النماذج +# - local: main_classes/text_generation +# title: توليد النصوص +# - local: main_classes/onnx +# title: ONNX +# - local: main_classes/optimizer_schedules +# title: التحسين +# - local: main_classes/output +# title: مخرجات النموذج +# - local: main_classes/pipelines +# title: خطوط الأنابيب +# - local: main_classes/processors +# title: المعالجات +# - local: main_classes/quantization +# title: التكميم +# - local: main_classes/tokenizer +# title: برنامج مقسم النصوص +# - local: main_classes/trainer +# title: المدرب +# - local: main_classes/deepspeed +# title: DeepSpeed +# - local: main_classes/feature_extractor +# title: مستخرج الميزات +# - local: main_classes/image_processor +# title: معالج الصور +# title: الفئات الرئيسية +# - sections: +# - isExpanded: false +# sections: +# - local: model_doc/albert +# title: ALBERT +# - local: model_doc/bart +# title: BART +# - local: model_doc/barthez +# title: BARThez +# - local: model_doc/bartpho +# title: BARTpho +# - local: model_doc/bert +# title: BERT +# - local: model_doc/bert-generation +# title: BertGeneration +# - local: model_doc/bert-japanese +# title: BertJapanese +# - local: model_doc/bertweet +# title: Bertweet +# - local: model_doc/big_bird +# title: BigBird +# - local: model_doc/bigbird_pegasus +# title: BigBirdPegasus +# - local: model_doc/biogpt +# title: BioGpt +# - local: model_doc/blenderbot +# title: Blenderbot +# - local: model_doc/blenderbot-small +# title: Blenderbot Small +# - local: model_doc/bloom +# title: BLOOM +# - local: model_doc/bort +# title: BORT +# - local: model_doc/byt5 +# title: ByT5 +# - local: model_doc/camembert +# title: CamemBERT +# - local: model_doc/canine 
+# title: CANINE +# - local: model_doc/codegen +# title: CodeGen +# - local: model_doc/code_llama +# title: CodeLlama +# - local: model_doc/cohere +# title: Cohere +# - local: model_doc/convbert +# title: ConvBERT +# - local: model_doc/cpm +# title: CPM +# - local: model_doc/cpmant +# title: CPMANT +# - local: model_doc/ctrl +# title: CTRL +# - local: model_doc/dbrx +# title: DBRX +# - local: model_doc/deberta +# title: DeBERTa +# - local: model_doc/deberta-v2 +# title: DeBERTa-v2 +# - local: model_doc/dialogpt +# title: DialoGPT +# - local: model_doc/distilbert +# title: DistilBERT +# - local: model_doc/dpr +# title: DPR +# - local: model_doc/electra +# title: ELECTRA +# - local: model_doc/encoder-decoder +# title: Encoder Decoder Models +# - local: model_doc/ernie +# title: ERNIE +# - local: model_doc/ernie_m +# title: ErnieM +# - local: model_doc/esm +# title: ESM +# - local: model_doc/falcon +# title: Falcon +# - local: model_doc/fastspeech2_conformer +# title: FastSpeech2Conformer +# - local: model_doc/flan-t5 +# title: FLAN-T5 +# - local: model_doc/flan-ul2 +# title: FLAN-UL2 +# - local: model_doc/flaubert +# title: FlauBERT +# - local: model_doc/fnet +# title: FNet +# - local: model_doc/fsmt +# title: FSMT +# - local: model_doc/funnel +# title: Funnel Transformer +# - local: model_doc/fuyu +# title: Fuyu +# - local: model_doc/gemma +# title: Gemma +# - local: model_doc/openai-gpt +# title: GPT +# - local: model_doc/gpt_neo +# title: GPT Neo +# - local: model_doc/gpt_neox +# title: GPT NeoX +# - local: model_doc/gpt_neox_japanese +# title: GPT NeoX Japanese +# - local: model_doc/gptj +# title: GPT-J +# - local: model_doc/gpt2 +# title: GPT2 +# - local: model_doc/gpt_bigcode +# title: GPTBigCode +# - local: model_doc/gptsan-japanese +# title: GPTSAN Japanese +# - local: model_doc/gpt-sw3 +# title: GPTSw3 +# - local: model_doc/herbert +# title: HerBERT +# - local: model_doc/ibert +# title: I-BERT +# - local: model_doc/jamba +# title: Jamba +# - local: 
model_doc/jetmoe +# title: JetMoe +# - local: model_doc/jukebox +# title: Jukebox +# - local: model_doc/led +# title: LED +# - local: model_doc/llama +# title: LLaMA +# - local: model_doc/llama2 +# title: Llama2 +# - local: model_doc/llama3 +# title: Llama3 +# - local: model_doc/longformer +# title: Longformer +# - local: model_doc/longt5 +# title: LongT5 +# - local: model_doc/luke +# title: LUKE +# - local: model_doc/m2m_100 +# title: M2M100 +# - local: model_doc/madlad-400 +# title: MADLAD-400 +# - local: model_doc/mamba +# title: Mamba +# - local: model_doc/marian +# title: MarianMT +# - local: model_doc/markuplm +# title: MarkupLM +# - local: model_doc/mbart +# title: MBart and MBart-50 +# - local: model_doc/mega +# title: MEGA +# - local: model_doc/megatron-bert +# title: MegatronBERT +# - local: model_doc/megatron_gpt2 +# title: MegatronGPT2 +# - local: model_doc/mistral +# title: Mistral +# - local: model_doc/mixtral +# title: Mixtral +# - local: model_doc/mluke +# title: mLUKE +# - local: model_doc/mobilebert +# title: MobileBERT +# - local: model_doc/mpnet +# title: MPNet +# - local: model_doc/mpt +# title: MPT +# - local: model_doc/mra +# title: MRA +# - local: model_doc/mt5 +# title: MT5 +# - local: model_doc/mvp +# title: MVP +# - local: model_doc/nezha +# title: NEZHA +# - local: model_doc/nllb +# title: NLLB +# - local: model_doc/nllb-moe +# title: NLLB-MoE +# - local: model_doc/nystromformer +# title: Nyströmformer +# - local: model_doc/olmo +# title: OLMo +# - local: model_doc/open-llama +# title: Open-Llama +# - local: model_doc/opt +# title: OPT +# - local: model_doc/pegasus +# title: Pegasus +# - local: model_doc/pegasus_x +# title: PEGASUS-X +# - local: model_doc/persimmon +# title: Persimmon +# - local: model_doc/phi +# title: Phi +# - local: model_doc/phi3 +# title: Phi-3 +# - local: model_doc/phobert +# title: PhoBERT +# - local: model_doc/plbart +# title: PLBart +# - local: model_doc/prophetnet +# title: ProphetNet +# - local: 
model_doc/qdqbert +# title: QDQBert +# - local: model_doc/qwen2 +# title: Qwen2 +# - local: model_doc/qwen2_moe +# title: Qwen2MoE +# - local: model_doc/rag +# title: RAG +# - local: model_doc/realm +# title: REALM +# - local: model_doc/recurrent_gemma +# title: RecurrentGemma +# - local: model_doc/reformer +# title: Reformer +# - local: model_doc/rembert +# title: RemBERT +# - local: model_doc/retribert +# title: RetriBERT +# - local: model_doc/roberta +# title: RoBERTa +# - local: model_doc/roberta-prelayernorm +# title: RoBERTa-PreLayerNorm +# - local: model_doc/roc_bert +# title: RoCBert +# - local: model_doc/roformer +# title: RoFormer +# - local: model_doc/rwkv +# title: RWKV +# - local: model_doc/splinter +# title: Splinter +# - local: model_doc/squeezebert +# title: SqueezeBERT +# - local: model_doc/stablelm +# title: StableLm +# - local: model_doc/starcoder2 +# title: Starcoder2 +# - local: model_doc/switch_transformers +# title: SwitchTransformers +# - local: model_doc/t5 +# title: T5 +# - local: model_doc/t5v1.1 +# title: T5v1.1 +# - local: model_doc/tapex +# title: TAPEX +# - local: model_doc/transfo-xl +# title: Transformer XL +# - local: model_doc/ul2 +# title: UL2 +# - local: model_doc/umt5 +# title: UMT5 +# - local: model_doc/xmod +# title: X-MOD +# - local: model_doc/xglm +# title: XGLM +# - local: model_doc/xlm +# title: XLM +# - local: model_doc/xlm-prophetnet +# title: XLM-ProphetNet +# - local: model_doc/xlm-roberta +# title: XLM-RoBERTa +# - local: model_doc/xlm-roberta-xl +# title: XLM-RoBERTa-XL +# - local: model_doc/xlm-v +# title: XLM-V +# - local: model_doc/xlnet +# title: XLNet +# - local: model_doc/yoso +# title: YOSO +# title: Text models +# - isExpanded: false +# sections: +# - local: model_doc/beit +# title: BEiT +# - local: model_doc/bit +# title: BiT +# - local: model_doc/conditional_detr +# title: Conditional DETR +# - local: model_doc/convnext +# title: ConvNeXT +# - local: model_doc/convnextv2 +# title: ConvNeXTV2 +# - local: 
model_doc/cvt +# title: CVT +# - local: model_doc/deformable_detr +# title: Deformable DETR +# - local: model_doc/deit +# title: DeiT +# - local: model_doc/depth_anything +# title: Depth Anything +# - local: model_doc/deta +# title: DETA +# - local: model_doc/detr +# title: DETR +# - local: model_doc/dinat +# title: DiNAT +# - local: model_doc/dinov2 +# title: DINOV2 +# - local: model_doc/dit +# title: DiT +# - local: model_doc/dpt +# title: DPT +# - local: model_doc/efficientformer +# title: EfficientFormer +# - local: model_doc/efficientnet +# title: EfficientNet +# - local: model_doc/focalnet +# title: FocalNet +# - local: model_doc/glpn +# title: GLPN +# - local: model_doc/imagegpt +# title: ImageGPT +# - local: model_doc/levit +# title: LeViT +# - local: model_doc/mask2former +# title: Mask2Former +# - local: model_doc/maskformer +# title: MaskFormer +# - local: model_doc/mobilenet_v1 +# title: MobileNetV1 +# - local: model_doc/mobilenet_v2 +# title: MobileNetV2 +# - local: model_doc/mobilevit +# title: MobileViT +# - local: model_doc/mobilevitv2 +# title: MobileViTV2 +# - local: model_doc/nat +# title: NAT +# - local: model_doc/poolformer +# title: PoolFormer +# - local: model_doc/pvt +# title: Pyramid Vision Transformer (PVT) +# - local: model_doc/pvt_v2 +# title: Pyramid Vision Transformer v2 (PVTv2) +# - local: model_doc/regnet +# title: RegNet +# - local: model_doc/resnet +# title: ResNet +# - local: model_doc/segformer +# title: SegFormer +# - local: model_doc/seggpt +# title: SegGpt +# - local: model_doc/superpoint +# title: SuperPoint +# - local: model_doc/swiftformer +# title: SwiftFormer +# - local: model_doc/swin +# title: Swin Transformer +# - local: model_doc/swinv2 +# title: Swin Transformer V2 +# - local: model_doc/swin2sr +# title: Swin2SR +# - local: model_doc/table-transformer +# title: Table Transformer +# - local: model_doc/upernet +# title: UperNet +# - local: model_doc/van +# title: VAN +# - local: model_doc/vit +# title: Vision 
Transformer (ViT) +# - local: model_doc/vit_hybrid +# title: ViT Hybrid +# - local: model_doc/vitdet +# title: ViTDet +# - local: model_doc/vit_mae +# title: ViTMAE +# - local: model_doc/vitmatte +# title: ViTMatte +# - local: model_doc/vit_msn +# title: ViTMSN +# - local: model_doc/yolos +# title: YOLOS +# title: Vision models +# - isExpanded: false +# sections: +# - local: model_doc/audio-spectrogram-transformer +# title: Audio Spectrogram Transformer +# - local: model_doc/bark +# title: Bark +# - local: model_doc/clap +# title: CLAP +# - local: model_doc/encodec +# title: EnCodec +# - local: model_doc/hubert +# title: Hubert +# - local: model_doc/mctct +# title: MCTCT +# - local: model_doc/mms +# title: MMS +# - local: model_doc/musicgen +# title: MusicGen +# - local: model_doc/musicgen_melody +# title: MusicGen Melody +# - local: model_doc/pop2piano +# title: Pop2Piano +# - local: model_doc/seamless_m4t +# title: Seamless-M4T +# - local: model_doc/seamless_m4t_v2 +# title: SeamlessM4T-v2 +# - local: model_doc/sew +# title: SEW +# - local: model_doc/sew-d +# title: SEW-D +# - local: model_doc/speech_to_text +# title: Speech2Text +# - local: model_doc/speech_to_text_2 +# title: Speech2Text2 +# - local: model_doc/speecht5 +# title: SpeechT5 +# - local: model_doc/unispeech +# title: UniSpeech +# - local: model_doc/unispeech-sat +# title: UniSpeech-SAT +# - local: model_doc/univnet +# title: UnivNet +# - local: model_doc/vits +# title: VITS +# - local: model_doc/wav2vec2 +# title: Wav2Vec2 +# - local: model_doc/wav2vec2-bert +# title: Wav2Vec2-BERT +# - local: model_doc/wav2vec2-conformer +# title: Wav2Vec2-Conformer +# - local: model_doc/wav2vec2_phoneme +# title: Wav2Vec2Phoneme +# - local: model_doc/wavlm +# title: WavLM +# - local: model_doc/whisper +# title: Whisper +# - local: model_doc/xls_r +# title: XLS-R +# - local: model_doc/xlsr_wav2vec2 +# title: XLSR-Wav2Vec2 +# title: Audio models +# - isExpanded: false +# sections: +# - local: model_doc/timesformer 
+# title: TimeSformer +# - local: model_doc/videomae +# title: VideoMAE +# - local: model_doc/vivit +# title: ViViT +# title: Video models +# - isExpanded: false +# sections: +# - local: model_doc/align +# title: ALIGN +# - local: model_doc/altclip +# title: AltCLIP +# - local: model_doc/blip +# title: BLIP +# - local: model_doc/blip-2 +# title: BLIP-2 +# - local: model_doc/bridgetower +# title: BridgeTower +# - local: model_doc/bros +# title: BROS +# - local: model_doc/chinese_clip +# title: Chinese-CLIP +# - local: model_doc/clip +# title: CLIP +# - local: model_doc/clipseg +# title: CLIPSeg +# - local: model_doc/clvp +# title: CLVP +# - local: model_doc/data2vec +# title: Data2Vec +# - local: model_doc/deplot +# title: DePlot +# - local: model_doc/donut +# title: Donut +# - local: model_doc/flava +# title: FLAVA +# - local: model_doc/git +# title: GIT +# - local: model_doc/grounding-dino +# title: Grounding DINO +# - local: model_doc/groupvit +# title: GroupViT +# - local: model_doc/idefics +# title: IDEFICS +# - local: model_doc/idefics2 +# title: Idefics2 +# - local: model_doc/instructblip +# title: InstructBLIP +# - local: model_doc/kosmos-2 +# title: KOSMOS-2 +# - local: model_doc/layoutlm +# title: LayoutLM +# - local: model_doc/layoutlmv2 +# title: LayoutLMV2 +# - local: model_doc/layoutlmv3 +# title: LayoutLMV3 +# - local: model_doc/layoutxlm +# title: LayoutXLM +# - local: model_doc/lilt +# title: LiLT +# - local: model_doc/llava +# title: Llava +# - local: model_doc/llava_next +# title: LLaVA-NeXT +# - local: model_doc/lxmert +# title: LXMERT +# - local: model_doc/matcha +# title: MatCha +# - local: model_doc/mgp-str +# title: MGP-STR +# - local: model_doc/nougat +# title: Nougat +# - local: model_doc/oneformer +# title: OneFormer +# - local: model_doc/owlvit +# title: OWL-ViT +# - local: model_doc/owlv2 +# title: OWLv2 +# - local: model_doc/paligemma +# title: PaliGemma +# - local: model_doc/perceiver +# title: Perceiver +# - local: 
model_doc/pix2struct +# title: Pix2Struct +# - local: model_doc/sam +# title: Segment Anything +# - local: model_doc/siglip +# title: SigLIP +# - local: model_doc/speech-encoder-decoder +# title: Speech Encoder Decoder Models +# - local: model_doc/tapas +# title: TAPAS +# - local: model_doc/trocr +# title: TrOCR +# - local: model_doc/tvlt +# title: TVLT +# - local: model_doc/tvp +# title: TVP +# - local: model_doc/udop +# title: UDOP +# - local: model_doc/video_llava +# title: VideoLlava +# - local: model_doc/vilt +# title: ViLT +# - local: model_doc/vipllava +# title: VipLlava +# - local: model_doc/vision-encoder-decoder +# title: Vision Encoder Decoder Models +# - local: model_doc/vision-text-dual-encoder +# title: Vision Text Dual Encoder +# - local: model_doc/visual_bert +# title: VisualBERT +# - local: model_doc/xclip +# title: X-CLIP +# title: Multimodal models +# - isExpanded: false +# sections: +# - local: model_doc/decision_transformer +# title: محول القرار +# - local: model_doc/trajectory_transformer +# title: محول المسار +# title: نماذج التعلم التعزيزية +# - isExpanded: false +# sections: +# - local: model_doc/autoformer +# title: Autoformer +# - local: model_doc/informer +# title: Informer +# - local: model_doc/patchtsmixer +# title: PatchTSMixer +# - local: model_doc/patchtst +# title: PatchTST +# - local: model_doc/time_series_transformer +# title: محول السلاسل الزمنية +# title: نماذج السلاسل الزمنية +# - isExpanded: false +# sections: +# - local: model_doc/graphormer +# title: Graphormer +# title: نماذج الرسم البياني +# title: النماذج +# - sections: +# - local: internal/modeling_utils +# title: الطبقات المخصصة والمرافق +# - local: internal/pipelines_utils +# title: مرافق خطوط الأنابيب +# - local: internal/tokenization_utils +# title: مرافق مقسم النصوص +# - local: internal/trainer_utils +# title: مرافق المدرب +# - local: internal/generation_utils +# title: مرافق التوليد +# - local: internal/image_processing_utils +# title: مرافق معالجة الصور +# - 
local: internal/audio_utils +# title: مرافق معالجة الصوت +# - local: internal/file_utils +# title: مرافق عامة +# - local: internal/time_series_utils +# title: مرافق السلاسل الزمنية +# title: مساعدون داخليون +# title: API diff --git a/docs/source/ar/accelerate.md b/docs/source/ar/accelerate.md new file mode 100644 index 00000000000000..486c1efe59af60 --- /dev/null +++ b/docs/source/ar/accelerate.md @@ -0,0 +1,120 @@ +# التدريب الموزع باستخدام 🤗 Accelerate + + +مع تزايد حجم النماذج اللغوية، برز التوازي كإحدى الاستراتيجيات لتدريب نماذج أكبر على أجهزة محدودة وتسريع عملية التدريب بمقدار كبير. في Hugging Face، قمنا بإنشاء مكتبة [🤗 Accelerate](https://huggingface.co/docs/accelerate) لمساعدة المستخدمين على تدريب أي نموذج من Transformers بسهولة على أي نوع من الإعدادات الموزعة، سواء كان ذلك على عدة وحدات معالجة رسومات (GPUs) على جهاز واحد أو على عدة وحدات معالجة رسومات موزعة على عدة أجهزة. في هذا الدليل، ستتعلم كيفية تخصيص حلقة تدريب PyTorch الأصلية لتمكين التدريب في بيئة موزعة. + +## الإعداد + +ابدأ بتثبيت 🤗 Accelerate: + +```bash +pip install accelerate +``` + +ثم قم باستيراد وإنشاء كائن [`~accelerate.Accelerator`]. سيقوم [`~accelerate.Accelerator`] تلقائيًا باكتشاف نوع الإعداد الموزع الخاص بك وتهيئة جميع المكونات اللازمة للتدريب. لن تحتاج إلى وضع نموذجك على جهاز بشكل صريح. + +```py +>>> from accelerate import Accelerator + +>>> accelerator = Accelerator() +``` + +## الاستعداد للتسريع + +الخطوة التالية هي تمرير جميع كائنات التدريب ذات الصلة إلى دالة الإعداد [`~accelerate.Accelerator.prepare`]. ويشمل ذلك DataLoaders للتدريب والتقييم، ونموذجًا ومُحسِّنًا (optimizer): + +```py +>>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( +... train_dataloader, eval_dataloader, model, optimizer +... ) +``` + +## الخلفي Backward + +الإضافة الأخيرة هي استبدال الدالة المعتادة `loss.backward()` في حلقة التدريب الخاصة بك بدالة [`~accelerate.Accelerator.backward`] في 🤗 Accelerate: + +```py +>>> for epoch in range(num_epochs): +... 
for batch in train_dataloader: +... outputs = model(**batch) +... loss = outputs.loss +... accelerator.backward(loss) + +... optimizer.step() +... lr_scheduler.step() +... optimizer.zero_grad() +... progress_bar.update(1) +``` + +كما يمكنك أن ترى في الكود التالي، فأنت بحاجة فقط إلى إضافة أربعة أسطر من الكود إلى حلقة التدريب الخاصة بك لتمكين التدريب الموزع! + +```diff ++ from accelerate import Accelerator + from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler + ++ accelerator = Accelerator() + + model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) + optimizer = AdamW(model.parameters(), lr=3e-5) + +- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +- model.to(device) + ++ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( ++ train_dataloader, eval_dataloader, model, optimizer ++ ) + + num_epochs = 3 + num_training_steps = num_epochs * len(train_dataloader) + lr_scheduler = get_scheduler( + "linear", + optimizer=optimizer, + num_warmup_steps=0, + num_training_steps=num_training_steps + ) + + progress_bar = tqdm(range(num_training_steps)) + + model.train() + for epoch in range(num_epochs): + for batch in train_dataloader: +- batch = {k: v.to(device) for k, v in batch.items()} + outputs = model(**batch) + loss = outputs.loss +- loss.backward() ++ accelerator.backward(loss) +optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + progress_bar.update(1) +``` + +## تدريب + +بمجرد إضافة أسطر الكود ذات الصلة، قم بتشغيل التدريب الخاص بك في أحد النصوص أو الدفاتر مثل Colaboratory. 
+ +### التدريب باستخدام نص برمجي + +إذا كنت تشغل التدريب الخاص بك من نص برمجي، فقم بتشغيل الأمر التالي لإنشاء وحفظ ملف تكوين: + +```bash +accelerate config +``` + +ثم قم بتشغيل التدريب الخاص بك باستخدام: + +```bash +accelerate launch train.py +``` + +### التدريب باستخدام دفتر ملاحظات + +يمكن أيضًا تشغيل 🤗 Accelerate في دفاتر الملاحظات إذا كنت تخطط لاستخدام وحدات معالجة الموتّرات (TPUs) في Colaboratory. قم بتغليف كل الكود المسؤول عن التدريب في دالة، ومررها إلى [`~accelerate.notebook_launcher`]: + +```py +>>> from accelerate import notebook_launcher + +>>> notebook_launcher(training_function) +``` + +للحصول على مزيد من المعلومات حول 🤗 Accelerate وميزاته الغنية، يرجى الرجوع إلى [الوثائق](https://huggingface.co/docs/accelerate). \ No newline at end of file diff --git a/docs/source/ar/agents.md b/docs/source/ar/agents.md new file mode 100644 index 00000000000000..92b2a4715f6f07 --- /dev/null +++ b/docs/source/ar/agents.md @@ -0,0 +1,539 @@ +# الوكلاء والأدوات + +[[open-in-colab]] + +### ما هو الوكيل؟ + +يمكن للنماذج اللغوية الكبيرة (LLMs) التي تم تدريبها على أداء [نمذجة اللغة السببية](./tasks/language_modeling) التعامل مع مجموعة واسعة من المهام، ولكنها غالبًا ما تواجه صعوبات في المهام الأساسية مثل المنطق والحساب والبحث. وعندما يتم استدعاؤها في مجالات لا تؤدي فيها أداءً جيدًا، فإنها غالبًا ما تفشل في توليد الإجابة التي نتوقعها منها. + +يتمثل أحد النهج للتغلب على هذا القصور في إنشاء "وكيل". + +الوكيل هو نظام يستخدم LLM كمحرك له، ولديه حق الوصول إلى وظائف تسمى "أدوات". + +هذه "الأدوات" هي وظائف لأداء مهمة، وتحتوي على جميع الأوصاف اللازمة للوكيل لاستخدامها بشكل صحيح. 
+ +يمكن برمجة الوكيل للقيام بما يلي: +- وضع سلسلة من الإجراءات/الأدوات وتشغيلها جميعًا في نفس الوقت مثل [`CodeAgent`] على سبيل المثال +- التخطيط للإجراءات/الأدوات وتنفيذها واحدة تلو الأخرى والانتظار حتى انتهاء كل إجراء قبل إطلاق التالي مثل [`ReactJsonAgent`] على سبيل المثال + +### أنواع الوكلاء + +#### الوكيل البرمجي (Code agent) + +يتبع هذا الوكيل خطوات محددة: أولًا، يخطط لسلسلة من الإجراءات التي يريد تنفيذها، ثم يولّد شفرة Python لتنفيذ جميع الإجراءات في نفس الوقت. وهو يتعامل بشكل أصلي مع أنواع مختلفة من المدخلات والمخرجات للأدوات التي يستخدمها، وبالتالي فهو الخيار الموصى به للمهام متعددة الوسائط. + +#### وكلاء التفاعل + +هذا هو الوكيل الذي يتم اللجوء إليه لحل مهام الاستدلال، حيث يجعل إطار ReAct ([Yao et al.، 2022](https://huggingface.co/papers/2210.03629)) التفكير بناءً على الملاحظات السابقة فعالًا حقًا. + +نقوم بتنفيذ إصدارين من ReactJsonAgent: +- [`ReactJsonAgent`] يقوم بتوليد استدعاءات الأدوات كـ JSON في مخرجاته. +- [`ReactCodeAgent`] هو نوع جديد من ReactJsonAgent يقوم بتوليد استدعاءات أدواته كمقاطع من التعليمات البرمجية، والتي تعمل بشكل جيد حقًا مع LLMs التي تتمتع بأداء قوي في البرمجة. + +> [!TIP] +> اقرأ منشور المدونة [Open-source LLMs as LangChain Agents](https://huggingface.co/blog/open-source-llms-as-agents) لمعرفة المزيد عن وكيل ReAct. + +![إطار عمل وكيل ReAct](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/open-source-llms-as-agents/ReAct.png) + +على سبيل المثال، إليك كيف يشق وكيل ReAct Code طريقه عبر السؤال التالي. + +```py3 +>>> agent.run( +... "How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?", +... ) +=====New task===== +How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need? 
+====Agent is executing the code below: +bert_blocks = search(query="number of blocks in BERT base encoder") +print("BERT blocks:", bert_blocks) +==== +Print outputs: +BERT blocks: twelve encoder blocks + +====Agent is executing the code below: +attention_layer = search(query="number of layers in Attention is All You Need") +print("Attention layers:", attention_layer) +==== +Print outputs: +Attention layers: Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position- 2 Page 3 Figure 1: The Transformer - model architecture. + +====Agent is executing the code below: +bert_blocks = 12 +attention_layers = 6 +diff = bert_blocks - attention_layers +print("Difference in blocks:", diff) +final_answer(diff) +==== + +Print outputs: +Difference in blocks: 6 + +Final answer: 6 +``` + +### كيف يمكنني بناء وكيل؟ + +لتهيئة وكيل، تحتاج إلى هذه الوسائط: + +- نموذج لغوي كبير (LLM) يشكل المحرك الأساسي للوكيل. الوكيل نفسه ليس النموذج اللغوي، بل هو برنامج يستخدم النموذج اللغوي كمحرك له. +- موجه النظام (system prompt): هذه هي التعليمات التي يتم إعطاؤها للنموذج اللغوي لإنشاء مخرجاته. +- صندوق أدوات (toolbox) يختار الوكيل منه الأدوات لتنفيذها +- محلل (parser) لاستخراج الأدوات التي يجب استدعاؤها من مخرجات النموذج اللغوي LLM والأدوات التي يجب استخدامها + +عند تهيئة نظام الوكيل، يتم استخدام سمات الأداة لإنشاء وصف للأداة، ثم يتم دمجها في موجه النظام الخاص `system_prompt` للوكيل لإعلامه بالأدوات التي يمكنه استخدامها ولماذا. + +للبدء، يرجى تثبيت `agents` الإضافية لتثبيت جميع التبعيات الافتراضية. + +```bash +pip install transformers[agents] +``` + +قم ببناء محرك LLM الخاص بك من خلال تعريف طريقة `llm_engine` التي تقبل قائمة من [الرسائل](./chat_templating) وتعيد النص. يجب أن تقبل هذه الدالة القابلة للاستدعاء أيضًا معامل `stop` يشير إلى متى يجب التوقف عن التوليد. 
+ +```python +from huggingface_hub import login, InferenceClient + +login("") + +client = InferenceClient(model="meta-llama/Meta-Llama-3-70B-Instruct") + +def llm_engine(messages, stop_sequences=["Task"]) -> str: + response = client.chat_completion(messages, stop=stop_sequences, max_tokens=1000) + answer = response.choices[0].message.content + return answer +``` + +يمكنك استخدام أي طريقة `llm_engine` طالما أنها: +1. تتبع تنسيق [الرسائل](./chat_templating.md) في مدخلاتها (`List[Dict[str, str]]`) وتعيد `str` +2. تتوقف عن توليد المخرجات عند التسلسلات التي تم تمريرها في معامل `stop` + +أنت بحاجة أيضًا إلى معامل `tools` الذي يقبل قائمة من الأدوات (`Tools`). يمكنك توفير قائمة فارغة لـ `tools`، مع إضافة صندوق الأدوات الافتراضي عبر المعامل الاختياري `add_base_tools=True`. + +الآن يمكنك إنشاء وكيل، مثل [`CodeAgent`]، وتشغيله. ولتسهيل الأمر، نقدم أيضًا فئة [`HfEngine`] التي تستخدم `huggingface_hub.InferenceClient` داخليًا. + +```python +from transformers import CodeAgent, HfEngine + +llm_engine = HfEngine(model="meta-llama/Meta-Llama-3-70B-Instruct") +agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True) + +agent.run( + "Could you translate this sentence from French, say it out loud and return the audio.", + sentence="Où est la boulangerie la plus proche?", +) +``` + +هذه الميزة ستكون مفيدة في حالة الحاجة الملحة! يمكنك حتى ترك معامل `llm_engine` غير محدد، وسيتم إنشاء [`HfEngine`] بشكل تلقائي. + +```python +from transformers import CodeAgent + +agent = CodeAgent(tools=[], add_base_tools=True) + +agent.run( + "Could you translate this sentence from French, say it out loud and give me the audio.", + sentence="Où est la boulangerie la plus proche?", +) +``` + +لاحظ أننا استخدمنا معامل "sentence" إضافي: يمكنك تمرير النص كمعامل إضافي إلى النموذج. 
+ +يمكنك أيضًا استخدام هذا للإشارة إلى مسار الملفات المحلية أو البعيدة للنموذج لاستخدامها: + +```py +from transformers import ReactCodeAgent + +agent = ReactCodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True) + +agent.run("Why does Mike not know many people in New York?", audio="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/recording.mp3") +``` + + +تم تحديد موجه النظام ومحلل المخرجات تلقائيًا، ولكن يمكنك فحصهما بسهولة عن طريق استدعاء `system_prompt_template` على وكيلك. + +```python +print(agent.system_prompt_template) +``` + +من المهم أن تشرح بأكبر قدر ممكن من الوضوح المهمة التي تريد تنفيذها. +كل عملية [`~Agent.run`] مستقلة، وبما أن الوكيل مدعوم من LLM، فقد تؤدي الاختلافات الطفيفة في موجهك إلى نتائج مختلفة تمامًا. +يمكنك أيضًا تشغيل وكيل بشكل متتالي لمهام مختلفة: في كل مرة يتم فيها إعادة تهيئة سمتي `agent.task` و`agent.logs`. + + +#### تنفيذ التعليمات البرمجية + +يقوم مفسر Python بتنفيذ التعليمات البرمجية على مجموعة من المدخلات التي يتم تمريرها جنبًا إلى جنب مع أدواتك. +يجب أن يكون هذا الأمر آمنًا لأن الوظائف الوحيدة التي يمكن استدعاؤها هي الأدوات التي قدمتها (خاصة إذا كانت أدوات من Hugging Face فقط) ووظيفة الطباعة، لذا فأنت مقيد بالفعل بما يمكن تنفيذه. + +مفسر Python لا يسمح أيضًا باستدعاء دوال بشكل افتراضي خارج قائمة آمنة، لذا فإن جميع الهجمات الأكثر وضوحًا لا ينبغي أن تكون مشكلة. +يمكنك أيضًا الإذن باستيرادات إضافية عن طريق تمرير الوحدات النمطية المصرح بها كقائمة من السلاسل في معامل `additional_authorized_imports` عند تهيئة [`ReactCodeAgent`] أو [`CodeAgent`]: + +```py +>>> from transformers import ReactCodeAgent + +>>> agent = ReactCodeAgent(tools=[], additional_authorized_imports=['requests', 'bs4']) +>>> agent.run("Could you get me the title of the page at url 'https://huggingface.co/blog'?") + +(...) +'Hugging Face – Blog' +``` + +سيتم إيقاف التنفيذ عند أي رمز يحاول تنفيذ عملية غير قانونية أو إذا كان هناك خطأ Python عادي في التعليمات البرمجية التي تم إنشاؤها بواسطة الوكيل. 
+ +> [!WARNING] +> يمكن لـ LLM توليد شفرة برمجية عشوائية سيتم تنفيذها بعد ذلك: لا تقم باستدعاء أي دوال غير آمنة! + +### موجه النظام + +يولّد الوكيل، أو بالأحرى LLM الذي يقود الوكيل، مخرجاته بناءً على موجه النظام. يمكن تخصيص موجه النظام وتصميمه للمهام المقصودة. على سبيل المثال، تحقق من موجه النظام لـ [`ReactCodeAgent`] (الإصدار أدناه مبسط قليلاً). + +```text +You will be given a task to solve as best you can. +You have access to the following tools: +<<tool_descriptions>> + +To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences. + +At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task, then the tools that you want to use. +Then in the 'Code:' sequence, you shold write the code in simple Python. The code sequence must end with '/End code' sequence. +During each intermediate step, you can use 'print()' to save whatever important information you will then need. +These print outputs will then be available in the 'Observation:' field, for using this information as input for the next step. + +In the end you have to return a final answer using the `final_answer` tool. + +Here are a few examples using notional tools: +--- +{examples} + +Above example were using notional tools that might not exist for you. You only have acces to those tools: +<<tool_names>> +You also can perform computations in the python code you generate. + +Always provide a 'Thought:' and a 'Code:\n```py' sequence ending with '```' sequence. You MUST provide at least the 'Code:' sequence to move forward. + +Remember to not perform too many operations in a single code block! You should split the task into intermediate code blocks. +Print results at the end of each step to save the intermediate results. Then use final_answer() to return the final result. + +Remember to make sure that variables you use are all defined. + +Now Begin! 
+``` + +يتضمن موجه النظام: +- *مقدمة* تشرح كيف يجب أن يتصرف الوكيل والأدوات التي يجب عليه استخدامها. +- وصف لجميع الأدوات التي يتم تحديدها بواسطة رمز `<<tool_descriptions>>` الذي يتم استبداله ديناميكيًا في وقت التشغيل بالأدوات التي يحددها المستخدم أو يختارها. + - يأتي وصف الأداة من سمات الأداة، `name`، و`description`، و`inputs` و`output_type`، وقالب `jinja2` بسيط يمكنك تحسينه. +- شكل المخرج المتوقع. + +يمكنك تحسين موجه النظام، على سبيل المثال، عن طريق إضافة شرح لتنسيق المخرجات. + +للحصول على أقصى قدر من المرونة، يمكنك الكتابة فوق قالب موجه النظام بالكامل عن طريق تمرير موجه مخصص كمعامل إلى معلمة `system_prompt`. + +```python +from transformers import ReactJsonAgent +from transformers.agents import PythonInterpreterTool + +agent = ReactJsonAgent(tools=[PythonInterpreterTool()], system_prompt="{your_custom_prompt}") +``` + +> [!WARNING] +> يرجى التأكد من تحديد سلسلة `<<tool_descriptions>>` في مكان ما في `template` حتى يكون الوكيل على علم +بالأدوات المتاحة. + + +### فحص تشغيل الوكيل + +فيما يلي بعض السمات المفيدة لفحص ما حدث بعد التشغيل: +- تخزن `agent.logs` سجلات مفصلة للوكيل. في كل خطوة من تشغيل الوكيل، يتم تخزين كل شيء في قاموس يُلحق بعد ذلك بـ `agent.logs`. +- تشغيل `agent.write_inner_memory_from_logs()` ينشئ ذاكرة داخلية من سجلات الوكيل ليطّلع عليها LLM، كقائمة من رسائل الدردشة. تنتقل هذه الطريقة عبر كل خطوة من سجل الوكيل ولا تخزن سوى ما يهمها كرسالة: على سبيل المثال، سيحفظ موجه النظام والمهمة في رسائل منفصلة، ثم لكل خطوة سيخزن مخرج LLM كرسالة، ومخرج استدعاء الأداة كرسالة أخرى. استخدم هذا إذا كنت تريد عرضًا عامًا لما حدث - ولكن لن يتم نسخ كل سجل بواسطة هذه الطريقة. + +## الأدوات + +الأداة هي عبارة عن وظيفة أساسية يستخدمها الوكيل لتنفيذ مهمة محددة. + +يمكنك على سبيل المثال التحقق من [`PythonInterpreterTool`]: لديه اسم ووصف ووصف للمدخلات ونوع للمخرج، وطريقة `__call__` التي تقوم بتنفيذ المهمة المطلوبة. + +عند تهيئة الوكيل، يتم استخدام سمات الأداة لتوليد وصف للأداة يتم تضمينه في موجه النظام الخاص بالوكيل. يتيح هذا للوكيل معرفة الأدوات التي يمكنه استخدامها ولماذا. 
+ +### صندوق الأدوات الافتراضي + +يأتي Transformers مع صندوق أدوات افتراضي لتمكين الوكلاء، والذي يمكنك إضافته إلى وكيلك عند التهيئة باستخدام معامل `add_base_tools = True`: + +- **الإجابة على أسئلة المستند**: الإجابة على سؤال حول المستند (مثل ملف PDF) بتنسيق صورة ([Donut](./model_doc/donut)) +- **الإجابة على أسئلة الصور**: الإجابة على سؤال حول صورة ([VILT](./model_doc/vilt)) +- **الكلام إلى نص**: تفريغ الكلام إلى نص ([Whisper](./model_doc/whisper)) +- **النص إلى كلام**: تحويل النص إلى كلام ([SpeechT5](./model_doc/speecht5)) +- **الترجمة**: ترجمة جملة معينة من لغة المصدر إلى لغة الهدف. +- **مفسر كود Python**: تشغيل كود Python الذي تم إنشاؤه بواسطة LLM في بيئة آمنة. لن يتم إضافة هذه الأداة إلى [`ReactJsonAgent`] إلا إذا استخدمت `add_base_tools=True`، نظرًا لأن الوكلاء المستندين إلى التعليمات البرمجية يمكنهم بالفعل تنفيذ كود Python. + + +يمكنك استخدام أداة يدويًا عن طريق استدعاء دالة [`load_tool`] وتحديد مهمة لتنفيذها. + +```python +from transformers import load_tool + +tool = load_tool("text-to-speech") +audio = tool("This is a text to speech tool") +``` + +### إنشاء أداة جديدة + +يمكنك إنشاء أداتك الخاصة لتغطية حالات الاستخدام التي لا تغطيها الأدوات الافتراضية من Hugging Face. +على سبيل المثال، دعنا ننشئ أداة تعرض النموذج الأكثر تنزيلًا لمهمة معينة من Hub. + +سوف نبدأ بالكود التالي. + +```python +from huggingface_hub import list_models + +task = "text-classification" + +model = next(iter(list_models(filter=task, sort="downloads", direction=-1))) +print(model.id) +``` + +يمكن تحويل هذه الشيفرة إلى فئة ترث من الفئة العليا [`Tool`]. + +تحتاج الأداة المخصصة إلى: + +- سمة `name`، والتي تمثل اسم الأداة نفسها. عادةً ما يصف الاسم وظيفتها. بما أن الكود يعيد النموذج الأكثر تنزيلًا لمهمة ما، فلنسمها `model_download_counter`. +- خاصية `description`، والتي تُستخدم لملء موجه نظام الوكيل. +- خاصية `inputs`، والتي هي عبارة عن قاموس بمفاتيح "type" و"description". 
يحتوي على معلومات تساعد مفسر Python على اتخاذ خيارات مستنيرة بشأن المدخلات. +- خاصية `output_type`، والتي تحدد نوع المخرج. +- طريقة `forward` والتي تحتوي على الكود الذي سيتم تنفيذه للحصول على النتيجة النهائية. + +```python +from transformers import Tool +from huggingface_hub import list_models + +class HFModelDownloadsTool(Tool): + name = "model_download_counter" + description = ( + "This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub. " + "It returns the name of the checkpoint." + ) + + inputs = { + "task": { + "type": "text", + "description": "the task category (such as text-classification, depth-estimation, etc)", + } + } + output_type = "text" + + def forward(self, task: str): + model = next(iter(list_models(filter=task, sort="downloads", direction=-1))) + return model.id +``` + +الآن بعد أن أصبحت فئة `HFModelDownloadsTool` المخصصة جاهزة، يمكنك حفظها في ملف باسم `model_downloads.py` واستيرادها للاستخدام. + +```python +from model_downloads import HFModelDownloadsTool + +tool = HFModelDownloadsTool() +``` + +يمكنك أيضًا مشاركة أداتك المخصصة في Hub عن طريق استدعاء [`~Tool.push_to_hub`] على الأداة. تأكد من أنك قمت بإنشاء مستودع لها على Hub وأنك تستخدم رمز وصول للقراءة. + +```python +tool.push_to_hub("{your_username}/hf-model-downloads") +``` + +قم بتحميل الأداة باستخدام دالة [`~Tool.load_tool`] ومررها إلى معلمة `tools` في الوكيل الخاص بك. + +```python +from transformers import load_tool, CodeAgent + +model_download_tool = load_tool("m-ric/hf-model-downloads") +agent = CodeAgent(tools=[model_download_tool], llm_engine=llm_engine) +agent.run( + "Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?" +) +``` + +ستحصل على ما يلي: + +```text +======== New task ======== +Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub? 
+==== Agent is executing the code below: +most_downloaded_model = model_download_counter(task="text-to-video") +print(f"The most downloaded model for the 'text-to-video' task is {most_downloaded_model}.") +==== +``` + +والناتج: + +`"النموذج الأكثر تنزيلًا لمهمة 'text-to-video' هو ByteDance/AnimateDiff-Lightning."` + +### إدارة صندوق أدوات الوكيل الخاص بك + +إذا كنت قد قمت بتهيئة وكيل، فمن غير الملائم إعادة تهيئته من البداية لإضافة أداة جديدة ترغب في استخدامها. باستخدام مكتبة Transformers، يمكنك إدارة صندوق أدوات الوكيل بإضافة أو استبدال أداة موجودة. + +دعنا نضيف الأداة `model_download_tool` إلى وكيل تم تهيئته مسبقًا باستخدام صندوق الأدوات الافتراضي. + +```python +from transformers import CodeAgent + +agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True) +agent.toolbox.add_tool(model_download_tool) +``` + +الآن يمكننا الاستفادة من الأداة الجديدة وأداة تحويل النص إلى كلام السابقة: + +```python + agent.run( + "Can you read out loud the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub and return the audio?" + ) +``` + +| **Audio** | +|------------------------------------------------------------------------------------------------------------------------------------------------------| +|