From 4b0418df11886547e2c701cc4504627881397a0b Mon Sep 17 00:00:00 2001
From: Raushan Turganbay
Date: Fri, 13 Sep 2024 12:58:38 +0200
Subject: [PATCH 01/67] Enable `padding_side` as call time kwargs (#33385)
* fix
* add padding-side kwarg
* add padding side in all models & fix tests
* fix copies
* fix tests
---
.../layoutlmv2/tokenization_layoutlmv2.py | 29 +++++++++--
.../tokenization_layoutlmv2_fast.py | 22 +++++++--
.../layoutlmv3/tokenization_layoutlmv3.py | 29 +++++++++--
.../tokenization_layoutlmv3_fast.py | 22 +++++++--
.../layoutxlm/tokenization_layoutxlm.py | 23 +++++++--
.../layoutxlm/tokenization_layoutxlm_fast.py | 18 +++++--
.../models/led/tokenization_led.py | 2 +
.../models/led/tokenization_led_fast.py | 2 +
.../models/luke/tokenization_luke.py | 29 +++++++++--
.../models/markuplm/tokenization_markuplm.py | 29 +++++++++--
.../markuplm/tokenization_markuplm_fast.py | 22 +++++++--
.../models/mluke/tokenization_mluke.py | 29 +++++++++--
.../models/roc_bert/tokenization_roc_bert.py | 17 +++++--
.../models/tapas/tokenization_tapas.py | 27 ++++++++--
.../models/udop/tokenization_udop.py | 27 ++++++++--
.../models/udop/tokenization_udop_fast.py | 22 +++++++--
.../models/wav2vec2/tokenization_wav2vec2.py | 6 +++
src/transformers/tokenization_utils.py | 7 +++
src/transformers/tokenization_utils_base.py | 37 ++++++++++++--
src/transformers/tokenization_utils_fast.py | 10 +++-
.../test_tokenization_layoutlmv2.py | 44 ++++++++++-------
.../test_tokenization_layoutlmv3.py | 44 ++++++++++-------
.../layoutxlm/test_tokenization_layoutxlm.py | 44 ++++++++++-------
.../markuplm/test_tokenization_markuplm.py | 44 ++++++++++-------
tests/models/tapas/test_tokenization_tapas.py | 43 +++++++++-------
tests/test_tokenization_common.py | 49 ++++++++++++-------
26 files changed, 528 insertions(+), 149 deletions(-)
diff --git a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py
index fe0305562374d7..c5ec79666deede 100644
--- a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py
@@ -414,6 +414,7 @@ def __call__(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -517,6 +518,7 @@ def _is_valid_text_input(t):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -539,6 +541,7 @@ def _is_valid_text_input(t):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -567,6 +570,7 @@ def batch_encode_plus(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -598,6 +602,7 @@ def batch_encode_plus(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -625,6 +630,7 @@ def _batch_encode_plus(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -653,6 +659,7 @@ def _batch_encode_plus(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -677,6 +684,7 @@ def _batch_prepare_for_model(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -708,6 +716,7 @@ def _batch_prepare_for_model(
max_length=max_length,
stride=stride,
pad_to_multiple_of=None, # we pad in batch afterward
+ padding_side=None, # we pad in batch afterward
return_attention_mask=False, # we pad in batch afterward
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -728,6 +737,7 @@ def _batch_prepare_for_model(
padding=padding_strategy.value,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -748,6 +758,7 @@ def encode(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -769,6 +780,7 @@ def encode(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -795,6 +807,7 @@ def encode_plus(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -838,6 +851,7 @@ def encode_plus(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -861,6 +875,7 @@ def _encode_plus(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -891,6 +906,7 @@ def _encode_plus(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
@@ -914,6 +930,7 @@ def prepare_for_model(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -1100,6 +1117,7 @@ def prepare_for_model(
max_length=max_length,
padding=padding_strategy.value,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -1243,6 +1261,7 @@ def _pad(
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
@@ -1265,6 +1284,9 @@ def _pad(
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
+ padding_side (`str`, *optional*):
+ The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+ Default value is picked from the class attribute of the same name.
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
@@ -1288,7 +1310,8 @@ def _pad(
if needs_to_be_padded:
difference = max_length - len(required_input)
- if self.padding_side == "right":
+ padding_side = padding_side if padding_side is not None else self.padding_side
+ if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if "token_type_ids" in encoded_inputs:
@@ -1302,7 +1325,7 @@ def _pad(
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
- elif self.padding_side == "left":
+ elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if "token_type_ids" in encoded_inputs:
@@ -1317,7 +1340,7 @@ def _pad(
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
else:
- raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+ raise ValueError("Invalid padding strategy:" + str(padding_side))
return encoded_inputs
diff --git a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py
index aa2bf6b3226b18..a666e3d4ea1a43 100644
--- a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py
+++ b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py
@@ -165,6 +165,7 @@ def __call__(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -268,6 +269,7 @@ def _is_valid_text_input(t):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -290,6 +292,7 @@ def _is_valid_text_input(t):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -318,6 +321,7 @@ def batch_encode_plus(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -349,6 +353,7 @@ def batch_encode_plus(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -381,6 +386,7 @@ def encode_plus(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -424,6 +430,7 @@ def encode_plus(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -451,6 +458,7 @@ def _batch_encode_plus(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -470,6 +478,7 @@ def _batch_encode_plus(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
)
if is_pair:
@@ -603,6 +612,7 @@ def _encode_plus(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[bool] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -631,6 +641,7 @@ def _encode_plus(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -663,6 +674,7 @@ def _pad(
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
@@ -685,6 +697,9 @@ def _pad(
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
+ padding_side (`str`, *optional*):
+ The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+ Default value is picked from the class attribute of the same name.
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
@@ -708,7 +723,8 @@ def _pad(
if needs_to_be_padded:
difference = max_length - len(required_input)
- if self.padding_side == "right":
+ padding_side = padding_side if padding_side is not None else self.padding_side
+ if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if "token_type_ids" in encoded_inputs:
@@ -722,7 +738,7 @@ def _pad(
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
- elif self.padding_side == "left":
+ elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if "token_type_ids" in encoded_inputs:
@@ -737,7 +753,7 @@ def _pad(
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
else:
- raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+ raise ValueError("Invalid padding strategy:" + str(padding_side))
return encoded_inputs
diff --git a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py
index 89f899f22f4ecc..248a299c141fd5 100644
--- a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py
+++ b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py
@@ -543,6 +543,7 @@ def __call__(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -646,6 +647,7 @@ def _is_valid_text_input(t):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -668,6 +670,7 @@ def _is_valid_text_input(t):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -697,6 +700,7 @@ def batch_encode_plus(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -728,6 +732,7 @@ def batch_encode_plus(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -756,6 +761,7 @@ def _batch_encode_plus(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -784,6 +790,7 @@ def _batch_encode_plus(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -809,6 +816,7 @@ def _batch_prepare_for_model(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -840,6 +848,7 @@ def _batch_prepare_for_model(
max_length=max_length,
stride=stride,
pad_to_multiple_of=None, # we pad in batch afterward
+ padding_side=None, # we pad in batch afterward
return_attention_mask=False, # we pad in batch afterward
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -860,6 +869,7 @@ def _batch_prepare_for_model(
padding=padding_strategy.value,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -881,6 +891,7 @@ def encode(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -902,6 +913,7 @@ def encode(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -929,6 +941,7 @@ def encode_plus(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -972,6 +985,7 @@ def encode_plus(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -996,6 +1010,7 @@ def _encode_plus(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -1026,6 +1041,7 @@ def _encode_plus(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
@@ -1049,6 +1065,7 @@ def prepare_for_model(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -1237,6 +1254,7 @@ def prepare_for_model(
max_length=max_length,
padding=padding_strategy.value,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -1382,6 +1400,7 @@ def _pad(
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
@@ -1404,6 +1423,9 @@ def _pad(
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
+ padding_side (`str`, *optional*):
+ The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+ Default value is picked from the class attribute of the same name.
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
@@ -1427,7 +1449,8 @@ def _pad(
if needs_to_be_padded:
difference = max_length - len(required_input)
- if self.padding_side == "right":
+ padding_side = padding_side if padding_side is not None else self.padding_side
+ if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if "token_type_ids" in encoded_inputs:
@@ -1441,7 +1464,7 @@ def _pad(
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
- elif self.padding_side == "left":
+ elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if "token_type_ids" in encoded_inputs:
@@ -1456,6 +1479,6 @@ def _pad(
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
else:
- raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+ raise ValueError("Invalid padding strategy:" + str(padding_side))
return encoded_inputs
diff --git a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py
index 07bedf36133ad8..63cd1022e52170 100644
--- a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py
+++ b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py
@@ -217,6 +217,7 @@ def __call__(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -320,6 +321,7 @@ def _is_valid_text_input(t):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -342,6 +344,7 @@ def _is_valid_text_input(t):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -371,6 +374,7 @@ def batch_encode_plus(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -402,6 +406,7 @@ def batch_encode_plus(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -436,6 +441,7 @@ def encode_plus(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -479,6 +485,7 @@ def encode_plus(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -506,6 +513,7 @@ def _batch_encode_plus(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -525,6 +533,7 @@ def _batch_encode_plus(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
)
if is_pair:
@@ -664,6 +673,7 @@ def _encode_plus(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[bool] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -692,6 +702,7 @@ def _encode_plus(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -725,6 +736,7 @@ def _pad(
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
@@ -747,6 +759,9 @@ def _pad(
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
+ padding_side (`str`, *optional*):
+ The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+ Default value is picked from the class attribute of the same name.
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
@@ -770,7 +785,8 @@ def _pad(
if needs_to_be_padded:
difference = max_length - len(required_input)
- if self.padding_side == "right":
+ padding_side = padding_side if padding_side is not None else self.padding_side
+ if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if "token_type_ids" in encoded_inputs:
@@ -784,7 +800,7 @@ def _pad(
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
- elif self.padding_side == "left":
+ elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if "token_type_ids" in encoded_inputs:
@@ -799,7 +815,7 @@ def _pad(
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
else:
- raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+ raise ValueError("Invalid padding strategy:" + str(padding_side))
return encoded_inputs
diff --git a/src/transformers/models/layoutxlm/tokenization_layoutxlm.py b/src/transformers/models/layoutxlm/tokenization_layoutxlm.py
index 3ab57ac892aa73..248f16af8441c1 100644
--- a/src/transformers/models/layoutxlm/tokenization_layoutxlm.py
+++ b/src/transformers/models/layoutxlm/tokenization_layoutxlm.py
@@ -447,6 +447,7 @@ def __call__(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -550,6 +551,7 @@ def _is_valid_text_input(t):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -572,6 +574,7 @@ def _is_valid_text_input(t):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -599,6 +602,7 @@ def _batch_encode_plus(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -627,6 +631,7 @@ def _batch_encode_plus(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -651,6 +656,7 @@ def _batch_prepare_for_model(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -682,6 +688,7 @@ def _batch_prepare_for_model(
max_length=max_length,
stride=stride,
pad_to_multiple_of=None, # we pad in batch afterward
+ padding_side=None, # we pad in batch afterward
return_attention_mask=False, # we pad in batch afterward
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -702,6 +709,7 @@ def _batch_prepare_for_model(
padding=padding_strategy.value,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -721,6 +729,7 @@ def _encode_plus(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -751,6 +760,7 @@ def _encode_plus(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
@@ -774,6 +784,7 @@ def prepare_for_model(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -947,6 +958,7 @@ def prepare_for_model(
max_length=max_length,
padding=padding_strategy.value,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -1090,6 +1102,7 @@ def _pad(
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
@@ -1112,6 +1125,9 @@ def _pad(
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
+ padding_side (`str`, *optional*):
+ The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+ Default value is picked from the class attribute of the same name.
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
@@ -1135,7 +1151,8 @@ def _pad(
if needs_to_be_padded:
difference = max_length - len(required_input)
- if self.padding_side == "right":
+ padding_side = padding_side if padding_side is not None else self.padding_side
+ if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if "token_type_ids" in encoded_inputs:
@@ -1149,7 +1166,7 @@ def _pad(
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
- elif self.padding_side == "left":
+ elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if "token_type_ids" in encoded_inputs:
@@ -1164,6 +1181,6 @@ def _pad(
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
else:
- raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+ raise ValueError("Invalid padding strategy:" + str(padding_side))
return encoded_inputs
diff --git a/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py b/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py
index 6d68cb9f18e7d6..7d12cec496ea30 100644
--- a/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py
+++ b/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py
@@ -277,6 +277,7 @@ def __call__(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -380,6 +381,7 @@ def _is_valid_text_input(t):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -402,6 +404,7 @@ def _is_valid_text_input(t):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -442,6 +445,7 @@ def _batch_encode_plus(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -462,6 +466,7 @@ def _batch_encode_plus(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
)
if is_pair:
@@ -595,6 +600,7 @@ def _encode_plus(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[bool] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -623,6 +629,7 @@ def _encode_plus(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -655,6 +662,7 @@ def _pad(
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
@@ -677,6 +685,9 @@ def _pad(
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
+ padding_side (`str`, *optional*):
+ The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+ Default value is picked from the class attribute of the same name.
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
@@ -700,7 +711,8 @@ def _pad(
if needs_to_be_padded:
difference = max_length - len(required_input)
- if self.padding_side == "right":
+ padding_side = padding_side if padding_side is not None else self.padding_side
+ if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if "token_type_ids" in encoded_inputs:
@@ -714,7 +726,7 @@ def _pad(
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
- elif self.padding_side == "left":
+ elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if "token_type_ids" in encoded_inputs:
@@ -729,7 +741,7 @@ def _pad(
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
else:
- raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+ raise ValueError("Invalid padding strategy:" + str(padding_side))
return encoded_inputs
diff --git a/src/transformers/models/led/tokenization_led.py b/src/transformers/models/led/tokenization_led.py
index aaf09e6d149eb1..6c1ec9526aefbf 100644
--- a/src/transformers/models/led/tokenization_led.py
+++ b/src/transformers/models/led/tokenization_led.py
@@ -412,6 +412,7 @@ def _pad(
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
encoded_inputs = super()._pad(
@@ -419,6 +420,7 @@ def _pad(
max_length=max_length,
padding_strategy=padding_strategy,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
diff --git a/src/transformers/models/led/tokenization_led_fast.py b/src/transformers/models/led/tokenization_led_fast.py
index ca15eb997bed5b..6ee69fbe792752 100644
--- a/src/transformers/models/led/tokenization_led_fast.py
+++ b/src/transformers/models/led/tokenization_led_fast.py
@@ -288,6 +288,7 @@ def _pad(
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
encoded_inputs = super()._pad(
@@ -295,6 +296,7 @@ def _pad(
max_length=max_length,
padding_strategy=padding_strategy,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
diff --git a/src/transformers/models/luke/tokenization_luke.py b/src/transformers/models/luke/tokenization_luke.py
index 1a570992ffb406..e06b9c753fe596 100644
--- a/src/transformers/models/luke/tokenization_luke.py
+++ b/src/transformers/models/luke/tokenization_luke.py
@@ -570,6 +570,7 @@ def __call__(
stride: int = 0,
is_split_into_words: Optional[bool] = False,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -662,6 +663,7 @@ def __call__(
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -688,6 +690,7 @@ def __call__(
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -715,6 +718,7 @@ def _encode_plus(
stride: int = 0,
is_split_into_words: Optional[bool] = False,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -769,6 +773,7 @@ def _encode_plus(
max_entity_length=max_entity_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
@@ -796,6 +801,7 @@ def _batch_encode_plus(
stride: int = 0,
is_split_into_words: Optional[bool] = False,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -876,6 +882,7 @@ def _batch_encode_plus(
max_entity_length=max_entity_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -1070,6 +1077,7 @@ def _batch_prepare_for_model(
max_entity_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -1112,6 +1120,7 @@ def _batch_prepare_for_model(
max_entity_length=max_entity_length,
stride=stride,
pad_to_multiple_of=None, # we pad in batch afterward
+ padding_side=None, # we pad in batch afterward
return_attention_mask=False, # we pad in batch afterward
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -1132,6 +1141,7 @@ def _batch_prepare_for_model(
padding=padding_strategy.value,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -1155,6 +1165,7 @@ def prepare_for_model(
max_entity_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -1357,6 +1368,7 @@ def prepare_for_model(
max_entity_length=max_entity_length,
padding=padding_strategy.value,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -1382,6 +1394,7 @@ def pad(
max_length: Optional[int] = None,
max_entity_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
verbose: bool = True,
@@ -1418,6 +1431,9 @@ def pad(
pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
+ padding_side (`str`, *optional*):
+ The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+ Default value is picked from the class attribute of the same name.
return_attention_mask (`bool`, *optional*):
Whether to return the attention mask. If left to the default, will return the attention mask according
to the specific tokenizer's default, defined by the `return_outputs` attribute. [What are attention
@@ -1495,6 +1511,7 @@ def pad(
max_entity_length=max_entity_length,
padding_strategy=padding_strategy,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
@@ -1519,6 +1536,7 @@ def pad(
max_entity_length=max_entity_length,
padding_strategy=padding_strategy,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -1536,6 +1554,7 @@ def _pad(
max_entity_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
@@ -1562,6 +1581,9 @@ def _pad(
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
+ padding_side (`str`, *optional*):
+ The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+ Default value is picked from the class attribute of the same name.
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
@@ -1600,9 +1622,10 @@ def _pad(
if needs_to_be_padded:
difference = max_length - len(encoded_inputs["input_ids"])
+ padding_side = padding_side if padding_side is not None else self.padding_side
if entities_provided:
entity_difference = max_entity_length - len(encoded_inputs["entity_ids"])
- if self.padding_side == "right":
+ if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if entities_provided:
@@ -1633,7 +1656,7 @@ def _pad(
encoded_inputs["entity_end_positions"] + [0] * entity_difference
)
- elif self.padding_side == "left":
+ elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if entities_provided:
@@ -1664,7 +1687,7 @@ def _pad(
"entity_end_positions"
]
else:
- raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+ raise ValueError("Invalid padding strategy:" + str(padding_side))
return encoded_inputs
diff --git a/src/transformers/models/markuplm/tokenization_markuplm.py b/src/transformers/models/markuplm/tokenization_markuplm.py
index c77865abc934c9..e5de1e4e765c93 100644
--- a/src/transformers/models/markuplm/tokenization_markuplm.py
+++ b/src/transformers/models/markuplm/tokenization_markuplm.py
@@ -503,6 +503,7 @@ def __call__(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -602,6 +603,7 @@ def _is_valid_text_input(t):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -624,6 +626,7 @@ def _is_valid_text_input(t):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -652,6 +655,7 @@ def batch_encode_plus(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -683,6 +687,7 @@ def batch_encode_plus(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -710,6 +715,7 @@ def _batch_encode_plus(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -738,6 +744,7 @@ def _batch_encode_plus(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -762,6 +769,7 @@ def _batch_prepare_for_model(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -793,6 +801,7 @@ def _batch_prepare_for_model(
max_length=max_length,
stride=stride,
pad_to_multiple_of=None, # we pad in batch afterward
+ padding_side=None, # we pad in batch afterward
return_attention_mask=False, # we pad in batch afterward
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -813,6 +822,7 @@ def _batch_prepare_for_model(
padding=padding_strategy.value,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -833,6 +843,7 @@ def encode(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -854,6 +865,7 @@ def encode(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -880,6 +892,7 @@ def encode_plus(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -923,6 +936,7 @@ def encode_plus(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -946,6 +960,7 @@ def _encode_plus(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -976,6 +991,7 @@ def _encode_plus(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
@@ -999,6 +1015,7 @@ def prepare_for_model(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -1203,6 +1220,7 @@ def prepare_for_model(
max_length=max_length,
padding=padding_strategy.value,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -1357,6 +1375,7 @@ def _pad(
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
@@ -1376,6 +1395,9 @@ def _pad(
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
+ padding_side (`str`, *optional*):
+ The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+ Default value is picked from the class attribute of the same name.
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
@@ -1399,7 +1421,8 @@ def _pad(
if needs_to_be_padded:
difference = max_length - len(required_input)
- if self.padding_side == "right":
+ padding_side = padding_side if padding_side is not None else self.padding_side
+ if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if "token_type_ids" in encoded_inputs:
@@ -1419,7 +1442,7 @@ def _pad(
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
- elif self.padding_side == "left":
+ elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if "token_type_ids" in encoded_inputs:
@@ -1440,6 +1463,6 @@ def _pad(
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
else:
- raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+ raise ValueError("Invalid padding strategy:" + str(padding_side))
return encoded_inputs
diff --git a/src/transformers/models/markuplm/tokenization_markuplm_fast.py b/src/transformers/models/markuplm/tokenization_markuplm_fast.py
index ff0e4ffeb56e9f..796459876425b4 100644
--- a/src/transformers/models/markuplm/tokenization_markuplm_fast.py
+++ b/src/transformers/models/markuplm/tokenization_markuplm_fast.py
@@ -286,6 +286,7 @@ def __call__(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -385,6 +386,7 @@ def _is_valid_text_input(t):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -407,6 +409,7 @@ def _is_valid_text_input(t):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -435,6 +438,7 @@ def batch_encode_plus(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -466,6 +470,7 @@ def batch_encode_plus(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -498,6 +503,7 @@ def encode_plus(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -541,6 +547,7 @@ def encode_plus(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -568,6 +575,7 @@ def _batch_encode_plus(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -587,6 +595,7 @@ def _batch_encode_plus(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
)
if is_pair:
@@ -721,6 +730,7 @@ def _encode_plus(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[bool] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -749,6 +759,7 @@ def _encode_plus(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -781,6 +792,7 @@ def _pad(
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
@@ -800,6 +812,9 @@ def _pad(
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
+ padding_side (`str`, *optional*):
+ The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+ Default value is picked from the class attribute of the same name.
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
@@ -823,7 +838,8 @@ def _pad(
if needs_to_be_padded:
difference = max_length - len(required_input)
- if self.padding_side == "right":
+ padding_side = padding_side if padding_side is not None else self.padding_side
+ if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if "token_type_ids" in encoded_inputs:
@@ -843,7 +859,7 @@ def _pad(
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
- elif self.padding_side == "left":
+ elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if "token_type_ids" in encoded_inputs:
@@ -864,7 +880,7 @@ def _pad(
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
else:
- raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+ raise ValueError("Invalid padding strategy:" + str(padding_side))
return encoded_inputs
diff --git a/src/transformers/models/mluke/tokenization_mluke.py b/src/transformers/models/mluke/tokenization_mluke.py
index 3ac8191402af90..f087c0d92fc63f 100644
--- a/src/transformers/models/mluke/tokenization_mluke.py
+++ b/src/transformers/models/mluke/tokenization_mluke.py
@@ -399,6 +399,7 @@ def __call__(
stride: int = 0,
is_split_into_words: Optional[bool] = False,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -491,6 +492,7 @@ def __call__(
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -517,6 +519,7 @@ def __call__(
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -545,6 +548,7 @@ def _encode_plus(
stride: int = 0,
is_split_into_words: Optional[bool] = False,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -599,6 +603,7 @@ def _encode_plus(
max_entity_length=max_entity_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
@@ -627,6 +632,7 @@ def _batch_encode_plus(
stride: int = 0,
is_split_into_words: Optional[bool] = False,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -707,6 +713,7 @@ def _batch_encode_plus(
max_entity_length=max_entity_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -904,6 +911,7 @@ def _batch_prepare_for_model(
max_entity_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -946,6 +954,7 @@ def _batch_prepare_for_model(
max_entity_length=max_entity_length,
stride=stride,
pad_to_multiple_of=None, # we pad in batch afterward
+ padding_side=None, # we pad in batch afterward
return_attention_mask=False, # we pad in batch afterward
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -966,6 +975,7 @@ def _batch_prepare_for_model(
padding=padding_strategy.value,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -990,6 +1000,7 @@ def prepare_for_model(
max_entity_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -1192,6 +1203,7 @@ def prepare_for_model(
max_entity_length=max_entity_length,
padding=padding_strategy.value,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -1218,6 +1230,7 @@ def pad(
max_length: Optional[int] = None,
max_entity_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
verbose: bool = True,
@@ -1254,6 +1267,9 @@ def pad(
pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
+ padding_side (`str`, *optional*):
+ The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+ Default value is picked from the class attribute of the same name.
return_attention_mask (`bool`, *optional*):
Whether to return the attention mask. If left to the default, will return the attention mask according
to the specific tokenizer's default, defined by the `return_outputs` attribute. [What are attention
@@ -1331,6 +1347,7 @@ def pad(
max_entity_length=max_entity_length,
padding_strategy=padding_strategy,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
@@ -1355,6 +1372,7 @@ def pad(
max_entity_length=max_entity_length,
padding_strategy=padding_strategy,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -1373,6 +1391,7 @@ def _pad(
max_entity_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
@@ -1399,6 +1418,9 @@ def _pad(
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
+ padding_side (`str`, *optional*):
+ The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+ Default value is picked from the class attribute of the same name.
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
@@ -1437,9 +1459,10 @@ def _pad(
if needs_to_be_padded:
difference = max_length - len(encoded_inputs["input_ids"])
+ padding_side = padding_side if padding_side is not None else self.padding_side
if entities_provided:
entity_difference = max_entity_length - len(encoded_inputs["entity_ids"])
- if self.padding_side == "right":
+ if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if entities_provided:
@@ -1470,7 +1493,7 @@ def _pad(
encoded_inputs["entity_end_positions"] + [0] * entity_difference
)
- elif self.padding_side == "left":
+ elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if entities_provided:
@@ -1501,7 +1524,7 @@ def _pad(
"entity_end_positions"
]
else:
- raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+ raise ValueError("Invalid padding strategy:" + str(padding_side))
return encoded_inputs
diff --git a/src/transformers/models/roc_bert/tokenization_roc_bert.py b/src/transformers/models/roc_bert/tokenization_roc_bert.py
index eaf2a1a491335d..3a980c0ae66f68 100644
--- a/src/transformers/models/roc_bert/tokenization_roc_bert.py
+++ b/src/transformers/models/roc_bert/tokenization_roc_bert.py
@@ -210,6 +210,7 @@ def _encode_plus(
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -283,6 +284,7 @@ def get_input_ids(text):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
@@ -308,6 +310,7 @@ def prepare_for_model(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -462,6 +465,7 @@ def prepare_for_model(
max_length=max_length,
padding=padding_strategy.value,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -480,6 +484,7 @@ def _pad(
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
# Load from model defaults
@@ -502,8 +507,9 @@ def _pad(
if needs_to_be_padded:
difference = max_length - len(required_input)
+ padding_side = padding_side if padding_side is not None else self.padding_side
- if self.padding_side == "right":
+ if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if "token_type_ids" in encoded_inputs:
@@ -516,7 +522,7 @@ def _pad(
if key in encoded_inputs:
encoded_inputs[key] = encoded_inputs[key] + [self.pad_token_id] * difference
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
- elif self.padding_side == "left":
+ elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if "token_type_ids" in encoded_inputs:
@@ -530,7 +536,7 @@ def _pad(
encoded_inputs[key] = [self.pad_token_id] * difference + encoded_inputs[key]
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
else:
- raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+ raise ValueError("Invalid padding strategy:" + str(padding_side))
return encoded_inputs
@@ -551,6 +557,7 @@ def _batch_encode_plus(
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -627,6 +634,7 @@ def get_input_ids(text):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -650,6 +658,7 @@ def _batch_prepare_for_model(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -686,6 +695,7 @@ def _batch_prepare_for_model(
max_length=max_length,
stride=stride,
pad_to_multiple_of=None, # we pad in batch afterward
+ padding_side=None, # we pad in batch afterward
return_attention_mask=False, # we pad in batch afterward
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -706,6 +716,7 @@ def _batch_prepare_for_model(
padding=padding_strategy.value,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
diff --git a/src/transformers/models/tapas/tokenization_tapas.py b/src/transformers/models/tapas/tokenization_tapas.py
index 2da9fe40c1ce88..867e53ff89078a 100644
--- a/src/transformers/models/tapas/tokenization_tapas.py
+++ b/src/transformers/models/tapas/tokenization_tapas.py
@@ -517,6 +517,7 @@ def __call__(
truncation: Union[bool, str, TapasTruncationStrategy] = False,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -581,6 +582,7 @@ def __call__(
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -602,6 +604,7 @@ def __call__(
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -631,6 +634,7 @@ def batch_encode_plus(
truncation: Union[bool, str, TapasTruncationStrategy] = False,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -699,6 +703,7 @@ def batch_encode_plus(
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -738,6 +743,7 @@ def _batch_encode_plus(
truncation: Union[bool, str, TapasTruncationStrategy] = False,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = True,
return_attention_mask: Optional[bool] = None,
@@ -768,6 +774,7 @@ def _batch_encode_plus(
add_special_tokens=add_special_tokens,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
@@ -797,6 +804,7 @@ def _batch_prepare_for_model(
truncation: Union[bool, str, TapasTruncationStrategy] = False,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = True,
return_attention_mask: Optional[bool] = True,
@@ -823,6 +831,7 @@ def _batch_prepare_for_model(
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=None, # we pad in batch afterwards
+ padding_side=None, # we pad in batch afterward
return_attention_mask=False, # we pad in batch afterwards
return_token_type_ids=return_token_type_ids,
return_special_tokens_mask=return_special_tokens_mask,
@@ -844,6 +853,7 @@ def _batch_prepare_for_model(
padding=padding,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -912,6 +922,7 @@ def encode_plus(
truncation: Union[bool, str, TapasTruncationStrategy] = False,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -968,6 +979,7 @@ def encode_plus(
padding=padding,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -993,6 +1005,7 @@ def _encode_plus(
truncation: Union[bool, str, TapasTruncationStrategy] = False,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = True,
return_attention_mask: Optional[bool] = True,
@@ -1024,6 +1037,7 @@ def _encode_plus(
padding=padding,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
@@ -1051,6 +1065,7 @@ def prepare_for_model(
truncation: Union[bool, str, TapasTruncationStrategy] = False,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = True,
return_attention_mask: Optional[bool] = True,
@@ -1214,6 +1229,7 @@ def prepare_for_model(
max_length=max_length,
padding=padding.value,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -1754,6 +1770,7 @@ def _pad(
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
@@ -1776,6 +1793,9 @@ def _pad(
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
+ padding_side (`str`, *optional*):
+ The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+ Default value is picked from the class attribute of the same name.
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
@@ -1799,7 +1819,8 @@ def _pad(
if needs_to_be_padded:
difference = max_length - len(encoded_inputs["input_ids"])
- if self.padding_side == "right":
+ padding_side = padding_side if padding_side is not None else self.padding_side
+ if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if "token_type_ids" in encoded_inputs:
@@ -1817,7 +1838,7 @@ def _pad(
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference
- elif self.padding_side == "left":
+ elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if "token_type_ids" in encoded_inputs:
@@ -1836,7 +1857,7 @@ def _pad(
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs["input_ids"]
else:
- raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+ raise ValueError("Invalid padding strategy:" + str(padding_side))
return encoded_inputs
diff --git a/src/transformers/models/udop/tokenization_udop.py b/src/transformers/models/udop/tokenization_udop.py
index 4be9799819168c..e40c07a58aceb7 100644
--- a/src/transformers/models/udop/tokenization_udop.py
+++ b/src/transformers/models/udop/tokenization_udop.py
@@ -551,6 +551,7 @@ def call_boxes(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -654,6 +655,7 @@ def _is_valid_text_input(t):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -676,6 +678,7 @@ def _is_valid_text_input(t):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -704,6 +707,7 @@ def batch_encode_plus_boxes(
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -746,6 +750,7 @@ def batch_encode_plus_boxes(
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -813,6 +818,7 @@ def encode_plus_boxes(
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -865,6 +871,7 @@ def encode_plus_boxes(
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -892,6 +899,7 @@ def _batch_encode_plus_boxes(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -920,6 +928,7 @@ def _batch_encode_plus_boxes(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -944,6 +953,7 @@ def _batch_prepare_for_model_boxes(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -975,6 +985,7 @@ def _batch_prepare_for_model_boxes(
max_length=max_length,
stride=stride,
pad_to_multiple_of=None, # we pad in batch afterward
+ padding_side=None, # we pad in batch afterward
return_attention_mask=False, # we pad in batch afterward
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -995,6 +1006,7 @@ def _batch_prepare_for_model_boxes(
padding=padding_strategy.value,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -1014,6 +1026,7 @@ def _encode_plus_boxes(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -1044,6 +1057,7 @@ def _encode_plus_boxes(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
@@ -1067,6 +1081,7 @@ def prepare_for_model_boxes(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -1240,6 +1255,7 @@ def prepare_for_model_boxes(
max_length=max_length,
padding=padding_strategy.value,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -1385,6 +1401,7 @@ def _pad(
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
@@ -1407,6 +1424,9 @@ def _pad(
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
+ padding_side (`str`, *optional*):
+ The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+ Default value is picked from the class attribute of the same name.
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
@@ -1430,7 +1450,8 @@ def _pad(
if needs_to_be_padded:
difference = max_length - len(required_input)
- if self.padding_side == "right":
+ padding_side = padding_side if padding_side is not None else self.padding_side
+ if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if "token_type_ids" in encoded_inputs:
@@ -1444,7 +1465,7 @@ def _pad(
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
- elif self.padding_side == "left":
+ elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if "token_type_ids" in encoded_inputs:
@@ -1459,6 +1480,6 @@ def _pad(
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
else:
- raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+ raise ValueError("Invalid padding strategy:" + str(padding_side))
return encoded_inputs
diff --git a/src/transformers/models/udop/tokenization_udop_fast.py b/src/transformers/models/udop/tokenization_udop_fast.py
index 8340c4af4e2bb7..8ee0577fa10e58 100644
--- a/src/transformers/models/udop/tokenization_udop_fast.py
+++ b/src/transformers/models/udop/tokenization_udop_fast.py
@@ -286,6 +286,7 @@ def call_boxes(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -389,6 +390,7 @@ def _is_valid_text_input(t):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -411,6 +413,7 @@ def _is_valid_text_input(t):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -453,6 +456,7 @@ def batch_encode_plus_boxes(
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -501,6 +505,7 @@ def batch_encode_plus_boxes(
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -528,6 +533,7 @@ def _batch_encode_plus_boxes(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -548,6 +554,7 @@ def _batch_encode_plus_boxes(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
)
if is_pair:
@@ -684,6 +691,7 @@ def _encode_plus_boxes(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[bool] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -712,6 +720,7 @@ def _encode_plus_boxes(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -794,6 +803,7 @@ def encode_plus_boxes(
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -846,6 +856,7 @@ def encode_plus_boxes(
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -864,6 +875,7 @@ def _pad(
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
@@ -886,6 +898,9 @@ def _pad(
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
+ padding_side (`str`, *optional*):
+ The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+ Default value is picked from the class attribute of the same name.
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
@@ -909,7 +924,8 @@ def _pad(
if needs_to_be_padded:
difference = max_length - len(required_input)
- if self.padding_side == "right":
+ padding_side = padding_side if padding_side is not None else self.padding_side
+ if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if "token_type_ids" in encoded_inputs:
@@ -923,7 +939,7 @@ def _pad(
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
- elif self.padding_side == "left":
+ elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if "token_type_ids" in encoded_inputs:
@@ -938,7 +954,7 @@ def _pad(
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
else:
- raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+ raise ValueError("Invalid padding strategy:" + str(padding_side))
return encoded_inputs
diff --git a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
index 647b18521d0515..c1a333fe48c6b4 100644
--- a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
@@ -781,6 +781,7 @@ def __call__(
padding: Union[bool, str, PaddingStrategy] = False,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
verbose: bool = True,
**kwargs,
@@ -794,6 +795,10 @@ def __call__(
The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
values, a list of numpy array or a list of list of float values. Must be mono channel audio, not
stereo, i.e. single float per timestep.
+
+ padding_side (`str`, *optional*):
+ The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+ Default value is picked from the class attribute of the same name.
"""
is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
@@ -825,6 +830,7 @@ def __call__(
padding=padding,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=self.return_attention_mask,
return_tensors=return_tensors,
verbose=verbose,
diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index f04eaae4525de9..6a5bff3679f8aa 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -749,6 +749,7 @@ def _encode_plus(
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -806,6 +807,7 @@ def get_input_ids(text):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
@@ -833,6 +835,7 @@ def _batch_encode_plus(
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -891,6 +894,7 @@ def get_input_ids(text):
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -913,6 +917,7 @@ def _batch_prepare_for_model(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -942,6 +947,7 @@ def _batch_prepare_for_model(
max_length=max_length,
stride=stride,
pad_to_multiple_of=None, # we pad in batch afterward
+ padding_side=None, # we pad in batch afterward
return_attention_mask=False, # we pad in batch afterward
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
@@ -963,6 +969,7 @@ def _batch_prepare_for_model(
padding=padding_strategy.value,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 5e9170456a07ea..93dea5ba09de36 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1427,6 +1427,9 @@ def all_special_ids(self) -> List[int]:
If set will pad the sequence to a multiple of the provided value. Requires `padding` to be activated.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
+ padding_side (`str`, *optional*):
+ The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+ Default value is picked from the class attribute of the same name.
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors instead of list of python integers. Acceptable values are:
@@ -2767,6 +2770,7 @@ def encode(
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
) -> List[int]:
@@ -2793,6 +2797,7 @@ def encode(
truncation=truncation,
max_length=max_length,
stride=stride,
+ padding_side=padding_side,
return_tensors=return_tensors,
**kwargs,
)
@@ -2956,6 +2961,7 @@ def __call__(
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -2997,6 +3003,7 @@ def __call__(
"stride": stride,
"is_split_into_words": is_split_into_words,
"pad_to_multiple_of": pad_to_multiple_of,
+ "padding_side": padding_side,
"return_tensors": return_tensors,
"return_token_type_ids": return_token_type_ids,
"return_attention_mask": return_attention_mask,
@@ -3041,6 +3048,7 @@ def _call_one(
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -3111,6 +3119,7 @@ def _is_valid_text_input(t):
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -3133,6 +3142,7 @@ def _is_valid_text_input(t):
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -3157,6 +3167,7 @@ def encode_plus(
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -3207,6 +3218,7 @@ def encode_plus(
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -3230,6 +3242,7 @@ def _encode_plus(
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -3261,6 +3274,7 @@ def batch_encode_plus(
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -3307,6 +3321,7 @@ def batch_encode_plus(
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
@@ -3336,6 +3351,7 @@ def _batch_encode_plus(
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -3361,6 +3377,7 @@ def pad(
padding: Union[bool, str, PaddingStrategy] = True,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
verbose: bool = True,
@@ -3409,6 +3426,9 @@ def pad(
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
+ padding_side (`str`, *optional*):
+ The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+ Default value is picked from the class attribute of the same name.
return_attention_mask (`bool`, *optional*):
Whether to return the attention mask. If left to the default, will return the attention mask according
to the specific tokenizer's default, defined by the `return_outputs` attribute.
@@ -3491,6 +3511,7 @@ def pad(
max_length=max_length,
padding_strategy=padding_strategy,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
@@ -3512,6 +3533,7 @@ def pad(
max_length=max_length,
padding_strategy=padding_strategy,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -3573,6 +3595,7 @@ def prepare_for_model(
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -3686,6 +3709,7 @@ def prepare_for_model(
max_length=max_length,
padding=padding_strategy.value,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_attention_mask=return_attention_mask,
)
@@ -3828,6 +3852,7 @@ def _pad(
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
@@ -3843,13 +3868,16 @@ def _pad(
- PaddingStrategy.LONGEST Pad to the longest sequence in the batch
- PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
- PaddingStrategy.DO_NOT_PAD: Do not pad
- The tokenizer padding sides are defined in self.padding_side:
+ The padding side is taken from the `padding_side` argument, falling back to `self.padding_side` when it is None:
- 'left': pads on the left of the sequences
- 'right': pads on the right of the sequences
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
+ padding_side:
+ The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+ Default value is picked from the class attribute of the same name.
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
@@ -3873,8 +3901,9 @@ def _pad(
if needs_to_be_padded:
difference = max_length - len(required_input)
+ padding_side = padding_side if padding_side is not None else self.padding_side
- if self.padding_side == "right":
+ if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
if "token_type_ids" in encoded_inputs:
@@ -3884,7 +3913,7 @@ def _pad(
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
- elif self.padding_side == "left":
+ elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if "token_type_ids" in encoded_inputs:
@@ -3895,7 +3924,7 @@ def _pad(
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
else:
- raise ValueError(f"Invalid padding strategy:{self.padding_side}")
+ raise ValueError(f"Invalid padding strategy:{padding_side}")
return encoded_inputs
diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py
index 7d5446d7cbf233..724484b3b30b88 100644
--- a/src/transformers/tokenization_utils_fast.py
+++ b/src/transformers/tokenization_utils_fast.py
@@ -429,6 +429,7 @@ def set_truncation_and_padding(
max_length: int,
stride: int,
pad_to_multiple_of: Optional[int],
+ padding_side: Optional[bool],
):
"""
Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers
@@ -450,6 +451,9 @@ def set_truncation_and_padding(
pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
+ padding_side (`str`, *optional*):
+ The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+ Default value is picked from the class attribute of the same name.
"""
_truncation = self._tokenizer.truncation
_padding = self._tokenizer.padding
@@ -484,7 +488,7 @@ def set_truncation_and_padding(
length = max_length if padding_strategy == PaddingStrategy.MAX_LENGTH else None
target = {
"length": length,
- "direction": self.padding_side,
+ "direction": padding_side if padding_side is not None else self.padding_side,
"pad_id": self.pad_token_id,
"pad_token": self.pad_token,
"pad_type_id": self.pad_token_type_id,
@@ -505,6 +509,7 @@ def _batch_encode_plus(
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[str] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -527,6 +532,7 @@ def _batch_encode_plus(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
)
if self._tokenizer.encode_special_tokens != split_special_tokens:
@@ -593,6 +599,7 @@ def _encode_plus(
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
+ padding_side: Optional[bool] = None,
return_tensors: Optional[bool] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
@@ -614,6 +621,7 @@ def _encode_plus(
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
+ padding_side=padding_side,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
diff --git a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py
index bb526e140e5740..19a6aeec46f935 100644
--- a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py
+++ b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py
@@ -21,6 +21,8 @@
import unittest
from typing import List
+from parameterized import parameterized
+
from transformers import (
AddedToken,
LayoutLMv2TokenizerFast,
@@ -393,7 +395,8 @@ def test_right_and_left_truncation(self):
def test_split_special_tokens(self):
pass
- def test_encode_plus_with_padding(self):
+ @parameterized.expand([(True,), (False,)])
+ def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
@@ -444,15 +447,18 @@ def test_encode_plus_with_padding(self):
self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask)
# Test right padding
- tokenizer.padding_side = "right"
+ tokenizer_kwargs_right = {
+ "max_length": sequence_length + padding_size,
+ "padding": "max_length",
+ "return_special_tokens_mask": True,
+ }
+
+ if not use_padding_as_call_kwarg:
+ tokenizer.padding_side = "right"
+ else:
+ tokenizer_kwargs_right["padding_side"] = "right"
- right_padded_sequence = tokenizer.encode_plus(
- words,
- boxes=boxes,
- max_length=sequence_length + padding_size,
- padding="max_length",
- return_special_tokens_mask=True,
- )
+ right_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_right)
right_padded_input_ids = right_padded_sequence["input_ids"]
right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
@@ -463,14 +469,18 @@ def test_encode_plus_with_padding(self):
self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask)
# Test left padding
- tokenizer.padding_side = "left"
- left_padded_sequence = tokenizer.encode_plus(
- words,
- boxes=boxes,
- max_length=sequence_length + padding_size,
- padding="max_length",
- return_special_tokens_mask=True,
- )
+ tokenizer_kwargs_left = {
+ "max_length": sequence_length + padding_size,
+ "padding": "max_length",
+ "return_special_tokens_mask": True,
+ }
+
+ if not use_padding_as_call_kwarg:
+ tokenizer.padding_side = "left"
+ else:
+ tokenizer_kwargs_left["padding_side"] = "left"
+
+ left_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_left)
left_padded_input_ids = left_padded_sequence["input_ids"]
left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
left_padded_sequence_length = len(left_padded_input_ids)
diff --git a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py
index 5ea384f0b26422..007e23430b3a56 100644
--- a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py
+++ b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py
@@ -22,6 +22,8 @@
import unittest
from typing import List
+from parameterized import parameterized
+
from transformers import (
AddedToken,
LayoutLMv3TokenizerFast,
@@ -273,7 +275,8 @@ def test_right_and_left_truncation(self):
def test_split_special_tokens(self):
pass
- def test_encode_plus_with_padding(self):
+ @parameterized.expand([(True,), (False,)])
+ def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
@@ -324,15 +327,18 @@ def test_encode_plus_with_padding(self):
self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask)
# Test right padding
- tokenizer.padding_side = "right"
+ tokenizer_kwargs_right = {
+ "max_length": sequence_length + padding_size,
+ "padding": "max_length",
+ "return_special_tokens_mask": True,
+ }
+
+ if not use_padding_as_call_kwarg:
+ tokenizer.padding_side = "right"
+ else:
+ tokenizer_kwargs_right["padding_side"] = "right"
- right_padded_sequence = tokenizer.encode_plus(
- words,
- boxes=boxes,
- max_length=sequence_length + padding_size,
- padding="max_length",
- return_special_tokens_mask=True,
- )
+ right_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_right)
right_padded_input_ids = right_padded_sequence["input_ids"]
right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
@@ -343,14 +349,18 @@ def test_encode_plus_with_padding(self):
self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask)
# Test left padding
- tokenizer.padding_side = "left"
- left_padded_sequence = tokenizer.encode_plus(
- words,
- boxes=boxes,
- max_length=sequence_length + padding_size,
- padding="max_length",
- return_special_tokens_mask=True,
- )
+ tokenizer_kwargs_left = {
+ "max_length": sequence_length + padding_size,
+ "padding": "max_length",
+ "return_special_tokens_mask": True,
+ }
+
+ if not use_padding_as_call_kwarg:
+ tokenizer.padding_side = "left"
+ else:
+ tokenizer_kwargs_left["padding_side"] = "left"
+
+ left_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_left)
left_padded_input_ids = left_padded_sequence["input_ids"]
left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
left_padded_sequence_length = len(left_padded_input_ids)
diff --git a/tests/models/layoutxlm/test_tokenization_layoutxlm.py b/tests/models/layoutxlm/test_tokenization_layoutxlm.py
index c0e44fcb30491f..8acd3716cf576b 100644
--- a/tests/models/layoutxlm/test_tokenization_layoutxlm.py
+++ b/tests/models/layoutxlm/test_tokenization_layoutxlm.py
@@ -19,6 +19,8 @@
import unittest
from typing import List
+from parameterized import parameterized
+
from transformers import (
AddedToken,
LayoutXLMTokenizerFast,
@@ -324,7 +326,8 @@ def test_encode_decode_with_spaces(self):
decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens)
self.assertIn(decoded, [output, output.lower()])
- def test_encode_plus_with_padding(self):
+ @parameterized.expand([(True,), (False,)])
+ def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
@@ -375,15 +378,18 @@ def test_encode_plus_with_padding(self):
self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask)
# Test right padding
- tokenizer.padding_side = "right"
+ tokenizer_kwargs_right = {
+ "max_length": sequence_length + padding_size,
+ "padding": "max_length",
+ "return_special_tokens_mask": True,
+ }
+
+ if not use_padding_as_call_kwarg:
+ tokenizer.padding_side = "right"
+ else:
+ tokenizer_kwargs_right["padding_side"] = "right"
- right_padded_sequence = tokenizer.encode_plus(
- words,
- boxes=boxes,
- max_length=sequence_length + padding_size,
- padding="max_length",
- return_special_tokens_mask=True,
- )
+ right_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_right)
right_padded_input_ids = right_padded_sequence["input_ids"]
right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
@@ -394,14 +400,18 @@ def test_encode_plus_with_padding(self):
self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask)
# Test left padding
- tokenizer.padding_side = "left"
- left_padded_sequence = tokenizer.encode_plus(
- words,
- boxes=boxes,
- max_length=sequence_length + padding_size,
- padding="max_length",
- return_special_tokens_mask=True,
- )
+ tokenizer_kwargs_left = {
+ "max_length": sequence_length + padding_size,
+ "padding": "max_length",
+ "return_special_tokens_mask": True,
+ }
+
+ if not use_padding_as_call_kwarg:
+ tokenizer.padding_side = "left"
+ else:
+ tokenizer_kwargs_left["padding_side"] = "left"
+
+ left_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_left)
left_padded_input_ids = left_padded_sequence["input_ids"]
left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
left_padded_sequence_length = len(left_padded_input_ids)
diff --git a/tests/models/markuplm/test_tokenization_markuplm.py b/tests/models/markuplm/test_tokenization_markuplm.py
index 458df94ec2fbcc..fcdde2eb8a874b 100644
--- a/tests/models/markuplm/test_tokenization_markuplm.py
+++ b/tests/models/markuplm/test_tokenization_markuplm.py
@@ -22,6 +22,8 @@
import unittest
from typing import List
+from parameterized import parameterized
+
from transformers import (
AddedToken,
MarkupLMTokenizerFast,
@@ -211,7 +213,8 @@ def test_encode_decode_with_spaces(self):
def test_right_and_left_truncation(self):
pass
- def test_encode_plus_with_padding(self):
+ @parameterized.expand([(True,), (False,)])
+ def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
@@ -262,15 +265,18 @@ def test_encode_plus_with_padding(self):
self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask)
# Test right padding
- tokenizer.padding_side = "right"
+ tokenizer_kwargs_right = {
+ "max_length": sequence_length + padding_size,
+ "padding": "max_length",
+ "return_special_tokens_mask": True,
+ }
+
+ if not use_padding_as_call_kwarg:
+ tokenizer.padding_side = "right"
+ else:
+ tokenizer_kwargs_right["padding_side"] = "right"
- right_padded_sequence = tokenizer.encode_plus(
- nodes,
- xpaths=xpaths,
- max_length=sequence_length + padding_size,
- padding="max_length",
- return_special_tokens_mask=True,
- )
+ right_padded_sequence = tokenizer.encode_plus(nodes, xpaths=xpaths, **tokenizer_kwargs_right)
right_padded_input_ids = right_padded_sequence["input_ids"]
right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
@@ -281,14 +287,18 @@ def test_encode_plus_with_padding(self):
self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask)
# Test left padding
- tokenizer.padding_side = "left"
- left_padded_sequence = tokenizer.encode_plus(
- nodes,
- xpaths=xpaths,
- max_length=sequence_length + padding_size,
- padding="max_length",
- return_special_tokens_mask=True,
- )
+ tokenizer_kwargs_left = {
+ "max_length": sequence_length + padding_size,
+ "padding": "max_length",
+ "return_special_tokens_mask": True,
+ }
+
+ if not use_padding_as_call_kwarg:
+ tokenizer.padding_side = "left"
+ else:
+ tokenizer_kwargs_left["padding_side"] = "left"
+
+ left_padded_sequence = tokenizer.encode_plus(nodes, xpaths=xpaths, **tokenizer_kwargs_left)
left_padded_input_ids = left_padded_sequence["input_ids"]
left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
left_padded_sequence_length = len(left_padded_input_ids)
diff --git a/tests/models/tapas/test_tokenization_tapas.py b/tests/models/tapas/test_tokenization_tapas.py
index a9b8e9a0c77fa6..49327a39cd80d3 100644
--- a/tests/models/tapas/test_tokenization_tapas.py
+++ b/tests/models/tapas/test_tokenization_tapas.py
@@ -21,6 +21,7 @@
import numpy as np
import pandas as pd
+from parameterized import parameterized
from transformers import AddedToken, is_torch_available
from transformers.models.tapas.tokenization_tapas import (
@@ -494,7 +495,8 @@ def test_encode_decode_with_spaces(self):
decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens)
self.assertIn(decoded, [output, output.lower()])
- def test_encode_plus_with_padding(self):
+ @parameterized.expand([(True,), (False,)])
+ def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
@@ -547,15 +549,18 @@ def test_encode_plus_with_padding(self):
assert special_tokens_mask == not_padded_special_tokens_mask
# Test right padding
- tokenizer.padding_side = "right"
+ tokenizer_kwargs_right = {
+ "max_length": sequence_length + padding_size,
+ "padding": "max_length",
+ "return_special_tokens_mask": True,
+ }
+
+ if not use_padding_as_call_kwarg:
+ tokenizer.padding_side = "right"
+ else:
+ tokenizer_kwargs_right["padding_side"] = "right"
- right_padded_sequence = tokenizer.encode_plus(
- table,
- sequence,
- max_length=sequence_length + padding_size,
- padding="max_length",
- return_special_tokens_mask=True,
- )
+ right_padded_sequence = tokenizer.encode_plus(table, sequence, **tokenizer_kwargs_right)
right_padded_input_ids = right_padded_sequence["input_ids"]
right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
@@ -566,14 +571,18 @@ def test_encode_plus_with_padding(self):
assert special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask
# Test left padding
- tokenizer.padding_side = "left"
- left_padded_sequence = tokenizer.encode_plus(
- table,
- sequence,
- max_length=sequence_length + padding_size,
- padding="max_length",
- return_special_tokens_mask=True,
- )
+ tokenizer_kwargs_left = {
+ "max_length": sequence_length + padding_size,
+ "padding": "max_length",
+ "return_special_tokens_mask": True,
+ }
+
+ if not use_padding_as_call_kwarg:
+ tokenizer.padding_side = "left"
+ else:
+ tokenizer_kwargs_left["padding_side"] = "left"
+
+ left_padded_sequence = tokenizer.encode_plus(table, sequence, **tokenizer_kwargs_left)
left_padded_input_ids = left_padded_sequence["input_ids"]
left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
left_padded_sequence_length = len(left_padded_input_ids)
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index 64c860e3fc177d..342254dfbdf066 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -2225,7 +2225,15 @@ def test_padding_with_attention_mask(self):
else:
self.assertListEqual(padded_features["attention_mask"], [[1, 1, 1, 1, 1, 0], [0, 0, 0, 1, 1, 0]])
- def test_encode_plus_with_padding(self):
+ @parameterized.expand([(True,), (False,)])
+ def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool):
+ """
+ This test checks that padding works as expected when tokenizing a sequence.
+ Padding is expected to have no effect when the input is a single sequence and
+ the padding-strategy is not `max_length`. Otherwise it pads to the specified max-length
+ using the tokenizer's `padding_side` attribute. We also check that passing `padding_side`
+ as a call-time kwarg works the same way as setting the `tokenizer.padding_side` attribute.
+ """
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
@@ -2244,8 +2252,6 @@ def test_encode_plus_with_padding(self):
sequence_length = len(input_ids)
# Test 'longest' and 'no_padding' don't do anything
- tokenizer.padding_side = "right"
-
not_padded_sequence = tokenizer.encode_plus(
sequence,
padding=True,
@@ -2275,14 +2281,18 @@ def test_encode_plus_with_padding(self):
self.assertEqual(special_tokens_mask, not_padded_special_tokens_mask)
# Test right padding
- tokenizer.padding_side = "right"
+ tokenizer_kwargs_right = {
+ "max_length": sequence_length + padding_size,
+ "padding": "max_length",
+ "return_special_tokens_mask": True,
+ }
- right_padded_sequence = tokenizer.encode_plus(
- sequence,
- max_length=sequence_length + padding_size,
- padding="max_length",
- return_special_tokens_mask=True,
- )
+ if not use_padding_as_call_kwarg:
+ tokenizer.padding_side = "right"
+ else:
+ tokenizer_kwargs_right["padding_side"] = "right"
+
+ right_padded_sequence = tokenizer.encode_plus(sequence, **tokenizer_kwargs_right)
right_padded_input_ids = right_padded_sequence["input_ids"]
right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
@@ -2293,13 +2303,18 @@ def test_encode_plus_with_padding(self):
self.assertEqual(special_tokens_mask + [1] * padding_size, right_padded_special_tokens_mask)
# Test left padding
- tokenizer.padding_side = "left"
- left_padded_sequence = tokenizer.encode_plus(
- sequence,
- max_length=sequence_length + padding_size,
- padding="max_length",
- return_special_tokens_mask=True,
- )
+ tokenizer_kwargs_left = {
+ "max_length": sequence_length + padding_size,
+ "padding": "max_length",
+ "return_special_tokens_mask": True,
+ }
+
+ if not use_padding_as_call_kwarg:
+ tokenizer.padding_side = "left"
+ else:
+ tokenizer_kwargs_left["padding_side"] = "left"
+
+ left_padded_sequence = tokenizer.encode_plus(sequence, **tokenizer_kwargs_left)
left_padded_input_ids = left_padded_sequence["input_ids"]
left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
left_padded_sequence_length = len(left_padded_input_ids)
From 7a5659872a68ce9939c975b5727e5ac61136f256 Mon Sep 17 00:00:00 2001
From: Alvaro Moran <6949769+tengomucho@users.noreply.github.com>
Date: Fri, 13 Sep 2024 13:19:06 +0200
Subject: [PATCH 02/67] Mitigate a conflict when using sentencepiece (#33327)
* test(tokenizers): add a test showing conflict with sentencepiece
This is due to the fact that protobuf C implementation uses a global
pool for all added descriptors, so if two different files add
descriptors, they will end up conflicting.
* fix(tokenizers): mitigate sentencepiece/protobuf conflict
When sentencepiece is available, use that protobuf instead of the
internal one.
* chore(style): fix with ruff
---
src/transformers/convert_slow_tokenizer.py | 6 +++++-
tests/tokenization/test_tokenization_utils.py | 20 ++++++++++++++++++-
2 files changed, 24 insertions(+), 2 deletions(-)
diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py
index f2064a131dad42..eb75a46a6d9bf2 100644
--- a/src/transformers/convert_slow_tokenizer.py
+++ b/src/transformers/convert_slow_tokenizer.py
@@ -26,7 +26,7 @@
from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
from tokenizers.models import BPE, Unigram, WordPiece
-from .utils import is_protobuf_available, logging, requires_backends
+from .utils import is_protobuf_available, is_sentencepiece_available, logging, requires_backends
from .utils.import_utils import PROTOBUF_IMPORT_ERROR
@@ -34,6 +34,10 @@
def import_protobuf(error_message=""):
+ if is_sentencepiece_available():
+ from sentencepiece import sentencepiece_model_pb2
+
+ return sentencepiece_model_pb2
if is_protobuf_available():
import google.protobuf
diff --git a/tests/tokenization/test_tokenization_utils.py b/tests/tokenization/test_tokenization_utils.py
index f97ef6a630221d..b43923df84d712 100644
--- a/tests/tokenization/test_tokenization_utils.py
+++ b/tests/tokenization/test_tokenization_utils.py
@@ -35,7 +35,15 @@
is_tokenizers_available,
)
from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
-from transformers.testing_utils import CaptureStderr, require_flax, require_tf, require_tokenizers, require_torch, slow
+from transformers.testing_utils import (
+ CaptureStderr,
+ require_flax,
+ require_sentencepiece,
+ require_tf,
+ require_tokenizers,
+ require_torch,
+ slow,
+)
if is_tokenizers_available():
@@ -296,3 +304,13 @@ def test_len_tokenizer(self):
self.assertEqual(len(tokenizer), tokenizer.vocab_size + 1)
self.assertEqual(len(tokenizer.added_tokens_decoder), added_tokens_size + 1)
self.assertEqual(len(tokenizer.added_tokens_encoder), added_tokens_size + 1)
+
+ @require_sentencepiece
+ def test_sentencepiece_cohabitation(self):
+ from sentencepiece import sentencepiece_model_pb2 as _original_protobuf # noqa: F401
+
+ from transformers.convert_slow_tokenizer import import_protobuf # noqa: F401
+
+ # Now this will try to import sentencepiece_model_pb2_new.py. This should not fail even if the protobuf
+ # was already imported.
+ import_protobuf()
From dfd31158eefab01952e729588a37c9fcc81f0813 Mon Sep 17 00:00:00 2001
From: Amit Garg
Date: Fri, 13 Sep 2024 05:07:19 -0700
Subject: [PATCH 03/67] [Phi-3] Bug on stale kv cache (#33129)
* fix long seq bug
* fixed format
* fixed fn copy inconsistency
* fix long seq bug
* fixed format
* fixed fn copy inconsistency
* Addressed comments
* added a unit test
* fixed cache position
* Added a warning msg to the forward fn
* fixed test case
---
src/transformers/models/phi3/modeling_phi3.py | 23 ++++++++++-
tests/models/phi3/test_modeling_phi3.py | 41 +++++++++++++++++++
2 files changed, 62 insertions(+), 2 deletions(-)
diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py
index f021c6ce2d339d..273b6a8f505e79 100644
--- a/src/transformers/models/phi3/modeling_phi3.py
+++ b/src/transformers/models/phi3/modeling_phi3.py
@@ -257,7 +257,7 @@ def __init__(self, dim, config, device=None):
@torch.no_grad()
def forward(self, x, position_ids, seq_len=None):
- seq_len = torch.max(position_ids) + 1
+ seq_len = seq_len or torch.max(position_ids) + 1
if seq_len > self.original_max_position_embeddings:
ext_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=x.device)
else:
@@ -1239,6 +1239,15 @@ def forward(
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
'This is an example script .\n Certainly! Below is a sample script that demonstrates a simple task, such as calculating the sum'
```"""
+ if (
+ use_cache
+ and self.config.rope_scaling
+ and cache_position is not None
+ and cache_position[0] == self.config.original_max_position_embeddings
+ ):
+ logger.warning(
+ f"If you are not using the generate method, you may encounter nonsensical outputs after the {self.config.original_max_position_embeddings}th token, as the KV cache needs to be recomputed."
+ )
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -1295,7 +1304,6 @@ def forward(
attentions=outputs.attentions,
)
- # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.prepare_inputs_for_generation
def prepare_inputs_for_generation(
self,
input_ids,
@@ -1308,6 +1316,17 @@ def prepare_inputs_for_generation(
num_logits_to_keep=None,
**kwargs,
):
+ # The first time the input length crosses the short/long factor switching point, force the cache to be recomputed.
+ # This makes that single token position slower, but it is better than the current failure.
+ if (
+ past_key_values
+ and self.config.rope_scaling
+ and input_ids.shape[1] >= self.config.original_max_position_embeddings + 1
+ ):
+ past_length = cache_position[0]
+ if past_length <= self.config.original_max_position_embeddings:
+ past_key_values = None
+
# If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
# Exception 1: when passing input_embeds, input_ids may be missing entries
# Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
diff --git a/tests/models/phi3/test_modeling_phi3.py b/tests/models/phi3/test_modeling_phi3.py
index a3f001aba467a0..ce0a71878877b5 100644
--- a/tests/models/phi3/test_modeling_phi3.py
+++ b/tests/models/phi3/test_modeling_phi3.py
@@ -442,6 +442,47 @@ def test_model_rope_scaling_from_config(self, scaling_type):
self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5))
self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5))
+ @parameterized.expand([("longrope",)])
+ def test_model_rope_scaling_short_long_factor(self, scaling_type):
+ config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+ n_factors = config.hidden_size // config.num_key_value_heads // 2
+ config.rope_scaling = {
+ "type": scaling_type,
+ "short_factor": [3.0 for _ in range(n_factors)],
+ "long_factor": [5.0 for _ in range(n_factors)],
+ }
+ input_tensor = ids_tensor([1, 4090], config.vocab_size)
+ model = Phi3ForCausalLM(config)
+ model.to(torch_device)
+ model.eval()
+ generation_args_short = {
+ "max_length": config.original_max_position_embeddings,
+ "temperature": 0.0,
+ "use_cache": True,
+ "do_sample": False,
+ "return_dict_in_generate": True,
+ }
+ output_with_short_factor = model.generate(input_tensor, **generation_args_short)
+ keys_with_short_factor = output_with_short_factor.past_key_values[0][0]
+ generation_args_long = {
+ "max_length": config.original_max_position_embeddings + 5,
+ "temperature": 0.0,
+ "use_cache": True,
+ "do_sample": False,
+ "return_dict_in_generate": True,
+ "output_logits": True,
+ }
+ output_with_long_factor = model.generate(input_tensor, **generation_args_long)
+ keys_with_long_factor = output_with_long_factor.past_key_values[0][0]
+ last_token_logits = output_with_long_factor.logits[-1][-1]
+ regenerated_last_token_logits = model(output_with_long_factor.sequences[:, :-1]).logits[0][-1]
+ keys_with_long_factor = keys_with_long_factor[:, :, : config.original_max_position_embeddings - 1, :]
+
+ # KV cache is re-computed after reaching the (`config.original_max_position_embeddings`+1)th token position
+ self.assertFalse(torch.allclose(keys_with_short_factor, keys_with_long_factor, atol=1e-2, rtol=1e-2))
+ # Last token generated using long factor
+ self.assertTrue(torch.allclose(last_token_logits, regenerated_last_token_logits, atol=1e-2, rtol=1e-2))
+
@slow
@require_torch
From 6cc4dfe3f1e8d421c6d6351388e06e9b123cbfe1 Mon Sep 17 00:00:00 2001
From: Marc Sun <57196510+SunMarc@users.noreply.github.com>
Date: Fri, 13 Sep 2024 15:06:08 +0200
Subject: [PATCH 04/67] Fix the initialization of the cache when we have multi
gpu (#33303)
* init cache multi-gpu
* Update src/transformers/generation/utils.py
Co-authored-by: Joao Gante
* switch to execution device map
* naming more consistant
* fix
* mutually exclusive device
* added an integration example
* remove useless check
* suggestion from joao + typing
* fix couple of typo and add test
* revert check
---------
Co-authored-by: Joao Gante
---
src/transformers/cache_utils.py | 40 +++++++++----
src/transformers/generation/utils.py | 27 +++++++++
tests/generation/test_utils.py | 85 ++++++++++++++++++++++++++++
3 files changed, 141 insertions(+), 11 deletions(-)
diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py
index b3e94da3d7d7bd..0671157e447038 100644
--- a/src/transformers/cache_utils.py
+++ b/src/transformers/cache_utils.py
@@ -1030,6 +1030,9 @@ class StaticCache(Cache):
The device on which the cache should be initialized. Should be the same as the layer.
dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
The default `dtype` to use when initializing the layer.
+ layer_device_map (`Dict[int, Union[str, torch.device, int]]`, *optional*):
+ Mapping between the layers and their devices. This is required when you are manually initializing the cache and the model is split across different GPUs.
+ You can find out which layer is mapped to which device by checking the associated device_map: `model.hf_device_map`.
Example:
@@ -1060,6 +1063,7 @@ def __init__(
device: torch.device = None,
dtype: torch.dtype = torch.float32,
max_batch_size: Optional[int] = None,
+ layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None,
) -> None:
super().__init__()
if max_batch_size is not None:
@@ -1088,16 +1092,20 @@ def __init__(
# Note: There will be significant perf decrease if switching to use 5D tensors instead.
cache_shape = (self.batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim)
for idx in range(config.num_hidden_layers):
- new_layer_key_cache = torch.zeros(cache_shape, dtype=self.dtype, device=device)
- new_layer_value_cache = torch.zeros(cache_shape, dtype=self.dtype, device=device)
+ if layer_device_map is not None:
+ layer_device = layer_device_map[idx]
+ else:
+ layer_device = device
+ new_layer_key_cache = torch.zeros(cache_shape, dtype=self.dtype, device=layer_device)
+ new_layer_value_cache = torch.zeros(cache_shape, dtype=self.dtype, device=layer_device)
# Notes:
# 1. `mark_static_address` is used to tag the cache as an fixed data pointer, preventing cuda graph
# breaks when updating the cache. It can't be used if the cache code is being compiled (but in that case
# it is not needed anyway)
# 2. `torch.export()` requires mutations to be registered as buffers.
if not is_torchdynamo_compiling():
- self.register_buffer(f"key_cache_{idx}", torch.zeros(cache_shape, dtype=dtype, device=device))
- self.register_buffer(f"value_cache_{idx}", torch.zeros(cache_shape, dtype=dtype, device=device))
+ self.register_buffer(f"key_cache_{idx}", torch.zeros(cache_shape, dtype=dtype, device=layer_device))
+ self.register_buffer(f"value_cache_{idx}", torch.zeros(cache_shape, dtype=dtype, device=layer_device))
new_layer_key_cache = getattr(self, f"key_cache_{idx}")
new_layer_value_cache = getattr(self, f"value_cache_{idx}")
torch._dynamo.mark_static_address(new_layer_key_cache)
@@ -1130,9 +1138,9 @@ def update(
Return:
A tuple containing the updated key and value states.
"""
+
cache_position = cache_kwargs.get("cache_position")
- self.key_cache[layer_idx] = self.key_cache[layer_idx].to(device=key_states.device)
- self.value_cache[layer_idx] = self.value_cache[layer_idx].to(device=value_states.device)
+
k_out = self.key_cache[layer_idx]
v_out = self.value_cache[layer_idx]
@@ -1201,6 +1209,9 @@ class SlidingWindowCache(StaticCache):
The device on which the cache should be initialized. Should be the same as the layer.
dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
The default `dtype` to use when initializing the layer.
+ layer_device_map (`Dict[int, Union[str, torch.device, int]]`, *optional*):
+ Mapping between the layers and their devices. This is required when you are manually initializing the cache and the model is split across different GPUs.
+ You can find out which layer is mapped to which device by checking the associated device_map: `model.hf_device_map`.
Example:
@@ -1231,6 +1242,7 @@ def __init__(
device: torch.device = None,
dtype: torch.dtype = torch.float32,
max_batch_size: Optional[int] = None,
+ layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None,
) -> None:
super().__init__()
if not hasattr(config, "sliding_window") or config.sliding_window is None:
@@ -1247,6 +1259,7 @@ def __init__(
device=device,
dtype=dtype,
max_batch_size=max_batch_size,
+ layer_device_map=layer_device_map,
)
def update(
@@ -1280,7 +1293,6 @@ def update(
v_out = v_out[:, :, indices]
try:
- cache_position.to(device=k_out.device)
k_out.index_copy_(2, cache_position, key_states)
v_out.index_copy_(2, cache_position, value_states)
except NotImplementedError:
@@ -1495,6 +1507,9 @@ class HybridCache(Cache):
The device on which the cache should be initialized. Should be the same as the layer.
dtype (torch.dtype, *optional*, defaults to `torch.float32`):
The default `dtype` to use when initializing the layer.
+ layer_device_map (`Dict[int, Union[str, torch.device, int]]`, *optional*):
+ Mapping between the layers and their devices. This is required when you are manually initializing the cache and the model is split across different GPUs.
+ You can find out which layer is mapped to which device by checking the associated device_map: `model.hf_device_map`.
Example:
@@ -1525,6 +1540,7 @@ def __init__(
device: Union[torch.device, str] = "cpu",
dtype: torch.dtype = torch.float32,
max_batch_size: Optional[int] = None,
+ layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None,
) -> None:
super().__init__()
if max_batch_size is not None:
@@ -1562,11 +1578,15 @@ def __init__(
self.head_dim,
)
for i in range(config.num_hidden_layers):
+ if layer_device_map is not None:
+ layer_device = layer_device_map[i]
+ else:
+ layer_device = device
# Note: `mark_static_address` is used to tag the cache as an fixed data pointer, preventing cuda graph
# breaks when updating the cache.
cache_shape = global_cache_shape if not self.is_sliding[i] else sliding_cache_shape
- new_layer_key_cache = torch.zeros(cache_shape, dtype=self.dtype, device=device)
- new_layer_value_cache = torch.zeros(cache_shape, dtype=self.dtype, device=device)
+ new_layer_key_cache = torch.zeros(cache_shape, dtype=self.dtype, device=layer_device)
+ new_layer_value_cache = torch.zeros(cache_shape, dtype=self.dtype, device=layer_device)
torch._dynamo.mark_static_address(new_layer_key_cache)
torch._dynamo.mark_static_address(new_layer_value_cache)
self.key_cache.append(new_layer_key_cache)
@@ -1617,8 +1637,6 @@ def update(
) -> Tuple[torch.Tensor]:
cache_position = cache_kwargs.get("cache_position")
sliding_window = cache_kwargs.get("sliding_window")
- self.key_cache[layer_idx] = self.key_cache[layer_idx].to(device=key_states.device)
- self.value_cache[layer_idx] = self.value_cache[layer_idx].to(device=value_states.device)
k_out = self.key_cache[layer_idx]
v_out = self.value_cache[layer_idx]
if sliding_window:
diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py
index 17a234c62b285e..019eb6c27f18cc 100644
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@@ -1446,12 +1446,39 @@ def _get_cache(
# models. May cause trobles with non-text modalities.
cache_dtype = self.get_output_embeddings().weight.dtype
+ def get_layer_device_map(execution_device_map: Optional[dict] = None):
+ if execution_device_map is None or len(execution_device_map) <= 1:
+ return None
+ layer_device_map = {}
+ for layer in execution_device_map:
+ for idx in range(self.config.num_hidden_layers):
+ if f".{idx}." in f"{layer}.":
+ layer_device_map[idx] = execution_device_map[layer]
+ break
+ for idx in range(self.config.num_hidden_layers):
+ if idx not in layer_device_map:
+ raise RuntimeError(f"layer {idx} has not been mapped to a device.")
+ return layer_device_map
+
+ execution_device_map = None
+ # Taken from dispatch_model from accelerate.
+ # This is needed here if we don't want to make changes in accelerate in order to save execution_device
+ # For offloaded case, we need to get the execution device, not just the device where it is offloaded
+ if hasattr(self, "hf_device_map"):
+ main_device = [d for d in self.hf_device_map.values() if d not in ["cpu", "disk"]][0]
+ execution_device_map = {
+ name: main_device if device in ["cpu", "disk"] else device
+ for name, device in self.hf_device_map.items()
+ }
+ layer_device_map = get_layer_device_map(execution_device_map)
+
cache_kwargs = {
"config": self.config if hasattr(self.config, "text_config") else self.config,
"max_batch_size": batch_size,
"max_cache_len": max_cache_len,
"device": device,
"dtype": cache_dtype,
+ "layer_device_map": layer_device_map,
}
self._cache = cache_cls(**cache_kwargs)
if requires_cross_attention_cache:
diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py
index 65507795c84dd8..0ed054ad58696e 100644
--- a/tests/generation/test_utils.py
+++ b/tests/generation/test_utils.py
@@ -3444,6 +3444,91 @@ def test_special_tokens_fall_back_to_model_default(self):
self.assertTrue(test_bos_id == gen_output[0, 0])
self.assertTrue(generation_config.bos_token_id is None)
+ @pytest.mark.generate
+ @require_torch_multi_gpu
+ def test_generate_with_static_cache_multi_gpu(self):
+ """
+ Tests if the static cache has been set correctly and if generate works correctly when we are using multi-gpus.
+ """
+ # need to split manually as auto doesn't work well with unbalanced model
+ device_map = {"model.embed_tokens": 0, "model.layers.0": 0, "model.layers.1": 1, "model.norm": 1, "lm_head": 0}
+ model = AutoModelForCausalLM.from_pretrained(
+ "hf-internal-testing/tiny-random-MistralForCausalLM", device_map=device_map
+ )
+ tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM")
+
+ text = "Hello world"
+ tokenized_inputs = tokenizer([text], return_tensors="pt")
+ input_ids = tokenized_inputs.input_ids.to(torch_device)
+
+ generation_kwargs = {
+ "max_new_tokens": 20,
+ "cache_implementation": "static",
+ "return_dict_in_generate": True, # Required to return `past_key_values`
+ }
+
+ results = model.generate(input_ids, **generation_kwargs)
+ self.assertTrue(isinstance(results.past_key_values, StaticCache))
+
+ # check device of each layer
+ key_cache_0 = results.past_key_values.key_cache[0]
+ value_cache_0 = results.past_key_values.value_cache[0]
+ self.assertTrue(key_cache_0.device == value_cache_0.device == torch.device(0))
+
+ key_cache_1 = results.past_key_values.key_cache[1]
+ value_cache_1 = results.past_key_values.value_cache[1]
+ self.assertTrue(key_cache_1.device == value_cache_1.device == torch.device(1))
+
+ @pytest.mark.generate
+ @require_torch_multi_gpu
+ def test_init_static_cache_multi_gpu(self):
+ """
+ Tests if the static cache has been set correctly when we initialize it manually in a multi-gpu setup.
+ """
+ # need to split manually as auto doesn't work well with unbalanced model
+ device_map = {"model.embed_tokens": 0, "model.layers.0": 0, "model.layers.1": 1, "model.norm": 1, "lm_head": 0}
+ model = AutoModelForCausalLM.from_pretrained(
+ "hf-internal-testing/tiny-random-MistralForCausalLM", device_map=device_map
+ )
+ tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM")
+
+ text = "Hello world"
+ tokenized_inputs = tokenizer([text], return_tensors="pt")
+ input_ids = tokenized_inputs.input_ids.to(torch_device)
+
+ generation_kwargs = {
+ "max_new_tokens": 20,
+ "return_dict_in_generate": True, # Required to return `past_key_values`
+ }
+
+ # TODO: We need to raise a warning in case the cache is not set correctly
+ # with self.assertRaisesRegex(ValueError, "If you are manually initializing the cache"):
+ # past_key_values = StaticCache(
+ # config=model.config, batch_size=1, max_cache_len=30, device=torch_device, dtype=model.dtype
+ # )
+ # results = model.generate(input_ids, past_key_values=past_key_values, **generation_kwargs)
+
+ # deduced from the device_map : layer 0 on device 0 and layer 1 on device 1
+ layer_device_map = {0: 0, 1: 1}
+ past_key_values = StaticCache(
+ config=model.config,
+ batch_size=1,
+ max_cache_len=30,
+ device=torch_device,
+ dtype=model.dtype,
+ layer_device_map=layer_device_map,
+ )
+ results = model.generate(input_ids, past_key_values=past_key_values, **generation_kwargs)
+
+ # check device of each layer
+ key_cache_0 = results.past_key_values.key_cache[0]
+ value_cache_0 = results.past_key_values.value_cache[0]
+ self.assertTrue(key_cache_0.device == value_cache_0.device == torch.device(0))
+
+ key_cache_1 = results.past_key_values.key_cache[1]
+ value_cache_1 = results.past_key_values.value_cache[1]
+ self.assertTrue(key_cache_1.device == value_cache_1.device == torch.device(1))
+
@require_torch
class TokenHealingTestCase(unittest.TestCase):
From 0963229e287501bed52ae1dabc17922524de6992 Mon Sep 17 00:00:00 2001
From: Marc Sun <57196510+SunMarc@users.noreply.github.com>
Date: Fri, 13 Sep 2024 15:07:12 +0200
Subject: [PATCH 05/67] Enable finetuning with torchao quantized model
(#33361)
enable training
---
src/transformers/quantizers/quantizer_torchao.py | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/src/transformers/quantizers/quantizer_torchao.py b/src/transformers/quantizers/quantizer_torchao.py
index 3b5dfff2090499..02ea8294a2d54a 100644
--- a/src/transformers/quantizers/quantizer_torchao.py
+++ b/src/transformers/quantizers/quantizer_torchao.py
@@ -166,7 +166,8 @@ def is_serializable(self):
@property
def is_trainable(self):
- # torchao does not have official support for QAT (Quantization Aware Training)
- # but torchao support nf4/PEFT, but it is not integrated yet
- # TODO: if this is supported in the future, do a version check here.
- return False
+ supported_quant_types_for_training = [
+ "int8_weight_only",
+ "int8_dynamic_activation_int8_weight",
+ ]
+ return self.quantization_config.quant_type in supported_quant_types_for_training
From e39b6c1c7cdc890b6849b8c9de545fc9590ba871 Mon Sep 17 00:00:00 2001
From: Sergio Paniego Blanco
Date: Fri, 13 Sep 2024 17:15:20 +0200
Subject: [PATCH 06/67] Corrected `Agents and tools` documentation links typos
(#33471)
* Corrected agents task link typo
* Corrected chat templating link
* Corrected chat templating link 2
---
docs/source/en/agents.md | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/docs/source/en/agents.md b/docs/source/en/agents.md
index b100e39f1c9591..0b889f4eec867b 100644
--- a/docs/source/en/agents.md
+++ b/docs/source/en/agents.md
@@ -19,7 +19,7 @@ rendered properly in your Markdown viewer.
### What is an agent?
-Large Language Models (LLMs) trained to perform [causal language modeling](./tasks/language_modeling.) can tackle a wide range of tasks, but they often struggle with basic tasks like logic, calculation, and search. When prompted in domains in which they do not perform well, they often fail to generate the answer we expect them to.
+Large Language Models (LLMs) trained to perform [causal language modeling](./tasks/language_modeling) can tackle a wide range of tasks, but they often struggle with basic tasks like logic, calculation, and search. When prompted in domains in which they do not perform well, they often fail to generate the answer we expect them to.
One approach to overcome this weakness is to create an *agent*.
@@ -114,7 +114,7 @@ To start with, please install the `agents` extras in order to install all defaul
pip install transformers[agents]
```
-Build your LLM engine by defining a `llm_engine` method which accepts a list of [messages](./chat_templating.) and returns text. This callable also needs to accept a `stop` argument that indicates when to stop generating.
+Build your LLM engine by defining a `llm_engine` method which accepts a list of [messages](./chat_templating) and returns text. This callable also needs to accept a `stop` argument that indicates when to stop generating.
```python
from huggingface_hub import login, InferenceClient
@@ -130,7 +130,7 @@ def llm_engine(messages, stop_sequences=["Task"]) -> str:
```
You could use any `llm_engine` method as long as:
-1. it follows the [messages format](./chat_templating.md) (`List[Dict[str, str]]`) for its input `messages`, and it returns a `str`.
+1. it follows the [messages format](./chat_templating) (`List[Dict[str, str]]`) for its input `messages`, and it returns a `str`.
2. it stops generating outputs at the sequences passed in the argument `stop_sequences`
Additionally, `llm_engine` can also take a `grammar` argument. In the case where you specify a `grammar` upon agent initialization, this argument will be passed to the calls to llm_engine, with the `grammar` that you defined upon initialization, to allow [constrained generation](https://huggingface.co/docs/text-generation-inference/conceptual/guidance) in order to force properly-formatted agent outputs.
From 7bb1c99800d235791dace10305731f377db8077b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?David=20Lemayian=20=E2=9C=A8?=
<877919+DavidLemayian@users.noreply.github.com>
Date: Sat, 14 Sep 2024 00:25:20 +0300
Subject: [PATCH 07/67] chore: fix typo in comment in
tokenization_utils_base.py (#33466)
docs: update grammar in comment in tokenization_utils_base.py
small grammar update in tokenization_utils_base.py comment
---
src/transformers/tokenization_utils_base.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 93dea5ba09de36..b4490578a70916 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -3457,7 +3457,7 @@ def pad(
if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], Mapping):
encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}
- # The model's main input name, usually `input_ids`, has be passed for padding
+ # The model's main input name, usually `input_ids`, has been passed for padding
if self.model_input_names[0] not in encoded_inputs:
raise ValueError(
"You should supply an encoding or a list of encodings to this method "
From 8bd2b1e8c23234cd607ca8d63f53c1edfea27462 Mon Sep 17 00:00:00 2001
From: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Date: Sat, 14 Sep 2024 12:28:39 +0200
Subject: [PATCH 08/67] Add support for Pixtral (#33449)
* initial commit
* gloups
* updates
* work
* weights match
* nits
* nits
* updates to support the tokenizer :)
* updates
* Pixtral processor (#33454)
* rough outline
* Add in image break and end tokens
* Fix
* Udo some formatting changes
* Set patch_size default
* Fix
* Fix token expansion
* nit in conversion script
* Fix image token list creation
* done
* add expected results
* Process list of list of images (#33465)
* updates
* working image and processor
* this is the expected format
* some fixes
* push current updated
* working mult images!
* add a small integration test
* Uodate configuration docstring
* Formatting
* Config docstring fix
* simplify model test
* fixup modeling and etests
* Return BatchMixFeature in image processor
* fix some copies
* update
* nits
* Update model docstring
* Apply suggestions from code review
* Fix up
* updates
* revert modeling changes
* update
* update
* fix load safe
* addd liscence
* update
* use pixel_values as required by the model
* skip some tests and refactor
* Add pixtral image processing tests (#33476)
* Image processing tests
* Add processing tests
* woops
* defaults reflect pixtral image processor
* fixup post merge
* images -> pixel values
* oups sorry Mr docbuilder
* isort
* fix
* fix processor tests
* small fixes
* nit
* update
* last nits
* oups this was really breaking!
* nits
* is composition needs to be true
---------
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
---
docs/source/en/_toctree.yml | 2 +
docs/source/en/index.md | 1 +
docs/source/en/model_doc/pixtral.md | 98 ++++
src/transformers/__init__.py | 13 +-
src/transformers/models/__init__.py | 1 +
.../models/auto/configuration_auto.py | 2 +
.../models/auto/image_processing_auto.py | 1 +
src/transformers/models/auto/modeling_auto.py | 1 +
.../models/auto/processing_auto.py | 1 +
.../models/auto/tokenization_auto.py | 1 +
.../models/llava/configuration_llava.py | 2 +-
src/transformers/models/pixtral/__init__.py | 70 +++
.../models/pixtral/configuration_pixtral.py | 103 ++++
.../pixtral/convert_pixtral_weights_to_hf.py | 285 ++++++++++
.../pixtral/image_processing_pixtral.py | 519 ++++++++++++++++++
.../models/pixtral/modeling_pixtral.py | 517 +++++++++++++++++
.../models/pixtral/processing_pixtral.py | 282 ++++++++++
src/transformers/utils/dummy_pt_objects.py | 14 +
.../utils/dummy_vision_objects.py | 7 +
tests/models/llava/test_modeling_llava.py | 47 ++
tests/models/pixtral/__init__.py | 0
.../pixtral/test_image_processing_pixtral.py | 217 ++++++++
tests/models/pixtral/test_modeling_pixtral.py | 292 ++++++++++
.../models/pixtral/test_processor_pixtral.py | 233 ++++++++
24 files changed, 2707 insertions(+), 2 deletions(-)
create mode 100644 docs/source/en/model_doc/pixtral.md
create mode 100644 src/transformers/models/pixtral/__init__.py
create mode 100644 src/transformers/models/pixtral/configuration_pixtral.py
create mode 100644 src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py
create mode 100644 src/transformers/models/pixtral/image_processing_pixtral.py
create mode 100644 src/transformers/models/pixtral/modeling_pixtral.py
create mode 100644 src/transformers/models/pixtral/processing_pixtral.py
create mode 100644 tests/models/pixtral/__init__.py
create mode 100644 tests/models/pixtral/test_image_processing_pixtral.py
create mode 100644 tests/models/pixtral/test_modeling_pixtral.py
create mode 100644 tests/models/pixtral/test_processor_pixtral.py
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 1c7f62ec6ea7b8..235ea81a7f1ea6 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -862,6 +862,8 @@
title: Perceiver
- local: model_doc/pix2struct
title: Pix2Struct
+ - local: model_doc/pixtral
+ title: Pixtral
- local: model_doc/sam
title: Segment Anything
- local: model_doc/siglip
diff --git a/docs/source/en/index.md b/docs/source/en/index.md
index 8e3a4da8b021de..c18426de4c031c 100644
--- a/docs/source/en/index.md
+++ b/docs/source/en/index.md
@@ -253,6 +253,7 @@ Flax), PyTorch, and/or TensorFlow.
| [Phi3](model_doc/phi3) | ✅ | ❌ | ❌ |
| [PhoBERT](model_doc/phobert) | ✅ | ✅ | ✅ |
| [Pix2Struct](model_doc/pix2struct) | ✅ | ❌ | ❌ |
+| [Pixtral](model_doc/pixtral) | ❌ | ❌ | ❌ |
| [PLBart](model_doc/plbart) | ✅ | ❌ | ❌ |
| [PoolFormer](model_doc/poolformer) | ✅ | ❌ | ❌ |
| [Pop2Piano](model_doc/pop2piano) | ✅ | ❌ | ❌ |
diff --git a/docs/source/en/model_doc/pixtral.md b/docs/source/en/model_doc/pixtral.md
new file mode 100644
index 00000000000000..8df2bf5af5f9ca
--- /dev/null
+++ b/docs/source/en/model_doc/pixtral.md
@@ -0,0 +1,98 @@
+
+
+# Pixtral
+
+## Overview
+
+The Pixtral model was released by the Mistral AI team on [vLLM](https://github.com/vllm-project/vllm/pull/8377), where a version of the code can be found!
+
+
+Tips:
+
+- Pixtral is a multimodal model; its main contributions are the 2D RoPE applied to the images and support for arbitrary image sizes (the images are neither padded together nor resized).
+- This model follows the `Llava` family, meaning image embeddings are placed instead of the `[IMG]` token placeholders.
+- The format for one or multiple prompts is the following:
+```
+"[INST][IMG]\nWhat are the things I should be cautious about when I visit this place?[/INST]"
+```
+Then, the processor will replace each `[IMG]` token with a number of `[IMG]` tokens that depends on the height and the width of the image. Each *row* of the image is separated by an `[IMG_BREAK]` token, and each image is separated by an `[IMG_END]` token.
+
+This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts) and [ArthurZ](https://huggingface.co/ArthurZ)
+
+Here is an example of how to run it:
+
+```python
+from transformers import LlavaForConditionalGeneration, AutoProcessor
+from PIL import Image
+
+model_id = "hf-internal-testing/pixtral-12b"
+model = LlavaForConditionalGeneration.from_pretrained(model_id).to("cuda")
+processor = AutoProcessor.from_pretrained(model_id)
+
+IMG_URLS = [
+ "https://picsum.photos/id/237/400/300",
+ "https://picsum.photos/id/231/200/300",
+ "https://picsum.photos/id/27/500/500",
+ "https://picsum.photos/id/17/150/600",
+]
+PROMPT = "[INST]Describe the images.\n[IMG][IMG][IMG][IMG][/INST]"
+
+inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to("cuda")
+generate_ids = model.generate(**inputs, max_new_tokens=500)
+output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+
+EXPECTED_GENERATION = """
+Describe the images.
+Sure, let's break down each image description:
+
+1. **Image 1:**
+ - **Description:** A black dog with a glossy coat is sitting on a wooden floor. The dog has a focused expression and is looking directly at the camera.
+ - **Details:** The wooden floor has a rustic appearance with visible wood grain patterns. The dog's eyes are a striking color, possibly brown or amber, which contrasts with its black fur.
+
+2. **Image 2:**
+ - **Description:** A scenic view of a mountainous landscape with a winding road cutting through it. The road is surrounded by lush green vegetation and leads to a distant valley.
+ - **Details:** The mountains are rugged with steep slopes, and the sky is clear, indicating good weather. The winding road adds a sense of depth and perspective to the image.
+
+3. **Image 3:**
+ - **Description:** A beach scene with waves crashing against the shore. There are several people in the water and on the beach, enjoying the waves and the sunset.
+ - **Details:** The waves are powerful, creating a dynamic and lively atmosphere. The sky is painted with hues of orange and pink from the setting sun, adding a warm glow to the scene.
+
+4. **Image 4:**
+ - **Description:** A garden path leading to a large tree with a bench underneath it. The path is bordered by well-maintained grass and flowers.
+ - **Details:** The path is made of small stones or gravel, and the tree provides a shaded area with the bench invitingly placed beneath it. The surrounding area is lush and green, suggesting a well-kept garden.
+
+Each image captures a different scene, from a close-up of a dog to expansive natural landscapes, showcasing various elements of nature and human interaction with it.
+"""
+
+```
+## PixtralVisionConfig
+
+[[autodoc]] PixtralVisionConfig
+
+## PixtralModel
+
+[[autodoc]] PixtralModel
+ - forward
+
+## PixtralImageProcessor
+
+[[autodoc]] PixtralImageProcessor
+ - preprocess
+
+## PixtralProcessor
+
+[[autodoc]] PixtralProcessor
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 00cc67915f3664..36775d8454ab8c 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -649,6 +649,7 @@
"Pix2StructTextConfig",
"Pix2StructVisionConfig",
],
+ "models.pixtral": ["PixtralProcessor", "PixtralVisionConfig"],
"models.plbart": ["PLBartConfig"],
"models.poolformer": ["PoolFormerConfig"],
"models.pop2piano": ["Pop2PianoConfig"],
@@ -1199,6 +1200,7 @@
_import_structure["models.owlvit"].extend(["OwlViTFeatureExtractor", "OwlViTImageProcessor"])
_import_structure["models.perceiver"].extend(["PerceiverFeatureExtractor", "PerceiverImageProcessor"])
_import_structure["models.pix2struct"].extend(["Pix2StructImageProcessor"])
+ _import_structure["models.pixtral"].append("PixtralImageProcessor")
_import_structure["models.poolformer"].extend(["PoolFormerFeatureExtractor", "PoolFormerImageProcessor"])
_import_structure["models.pvt"].extend(["PvtImageProcessor"])
_import_structure["models.qwen2_vl"].extend(["Qwen2VLImageProcessor"])
@@ -1359,7 +1361,6 @@
"AlignVisionModel",
]
)
-
_import_structure["models.altclip"].extend(
[
"AltCLIPModel",
@@ -2977,6 +2978,7 @@
"Pix2StructVisionModel",
]
)
+ _import_structure["models.pixtral"].extend(["PixtralModel", "PixtralPreTrainedModel"])
_import_structure["models.plbart"].extend(
[
"PLBartForCausalLM",
@@ -5434,6 +5436,10 @@
Pix2StructTextConfig,
Pix2StructVisionConfig,
)
+ from .models.pixtral import (
+ PixtralProcessor,
+ PixtralVisionConfig,
+ )
from .models.plbart import PLBartConfig
from .models.poolformer import (
PoolFormerConfig,
@@ -6009,6 +6015,7 @@
from .models.owlvit import OwlViTFeatureExtractor, OwlViTImageProcessor
from .models.perceiver import PerceiverFeatureExtractor, PerceiverImageProcessor
from .models.pix2struct import Pix2StructImageProcessor
+ from .models.pixtral import PixtralImageProcessor
from .models.poolformer import (
PoolFormerFeatureExtractor,
PoolFormerImageProcessor,
@@ -7448,6 +7455,10 @@
Pix2StructTextModel,
Pix2StructVisionModel,
)
+ from .models.pixtral import (
+ PixtralModel,
+ PixtralPreTrainedModel,
+ )
from .models.plbart import (
PLBartForCausalLM,
PLBartForConditionalGeneration,
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
index 26b96def67d992..2022048cd4553f 100644
--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@@ -187,6 +187,7 @@
phi3,
phobert,
pix2struct,
+ pixtral,
plbart,
poolformer,
pop2piano,
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index fa1a7fb88eafa8..2cd7d550d90b7a 100644
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -205,6 +205,7 @@
("phi", "PhiConfig"),
("phi3", "Phi3Config"),
("pix2struct", "Pix2StructConfig"),
+ ("pixtral", "PixtralVisionConfig"),
("plbart", "PLBartConfig"),
("poolformer", "PoolFormerConfig"),
("pop2piano", "Pop2PianoConfig"),
@@ -509,6 +510,7 @@
("phi3", "Phi3"),
("phobert", "PhoBERT"),
("pix2struct", "Pix2Struct"),
+ ("pixtral", "Pixtral"),
("plbart", "PLBart"),
("poolformer", "PoolFormer"),
("pop2piano", "Pop2Piano"),
diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
index c83c43518a6a31..95d9ddef8f7979 100644
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -114,6 +114,7 @@
("owlvit", ("OwlViTImageProcessor",)),
("perceiver", ("PerceiverImageProcessor",)),
("pix2struct", ("Pix2StructImageProcessor",)),
+ ("pixtral", ("PixtralImageProcessor",)),
("poolformer", ("PoolFormerImageProcessor",)),
("pvt", ("PvtImageProcessor",)),
("pvt_v2", ("PvtImageProcessor",)),
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index 45a9c4d0d078b7..e0d15f1e236590 100644
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -193,6 +193,7 @@
("persimmon", "PersimmonModel"),
("phi", "PhiModel"),
("phi3", "Phi3Model"),
+ ("pixtral", "PixtralModel"),
("plbart", "PLBartModel"),
("poolformer", "PoolFormerModel"),
("prophetnet", "ProphetNetModel"),
diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py
index 7f49e0e8d99730..82d325248eabfb 100644
--- a/src/transformers/models/auto/processing_auto.py
+++ b/src/transformers/models/auto/processing_auto.py
@@ -82,6 +82,7 @@
("owlvit", "OwlViTProcessor"),
("paligemma", "PaliGemmaProcessor"),
("pix2struct", "Pix2StructProcessor"),
+ ("pixtral", "PixtralProcessor"),
("pop2piano", "Pop2PianoProcessor"),
("qwen2_audio", "Qwen2AudioProcessor"),
("qwen2_vl", "Qwen2VLProcessor"),
diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
index c8eb06db04a098..e735579108d857 100644
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -385,6 +385,7 @@
("phi3", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
("phobert", ("PhobertTokenizer", None)),
("pix2struct", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)),
+ ("pixtral", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
("plbart", ("PLBartTokenizer" if is_sentencepiece_available() else None, None)),
("prophetnet", ("ProphetNetTokenizer", None)),
("qdqbert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
diff --git a/src/transformers/models/llava/configuration_llava.py b/src/transformers/models/llava/configuration_llava.py
index f2338a7c5a5df7..3a4cb09855f0ec 100644
--- a/src/transformers/models/llava/configuration_llava.py
+++ b/src/transformers/models/llava/configuration_llava.py
@@ -73,7 +73,7 @@ class LlavaConfig(PretrainedConfig):
```"""
model_type = "llava"
- is_composition = False
+ is_composition = True
def __init__(
self,
diff --git a/src/transformers/models/pixtral/__init__.py b/src/transformers/models/pixtral/__init__.py
new file mode 100644
index 00000000000000..e09ed8e60127dd
--- /dev/null
+++ b/src/transformers/models/pixtral/__init__.py
@@ -0,0 +1,72 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
+
+
+_import_structure = {
+    "configuration_pixtral": ["PixtralVisionConfig"],
+    "processing_pixtral": ["PixtralProcessor"],
+}
+
+
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_pixtral"] = [
+        "PixtralModel",
+        "PixtralPreTrainedModel",
+    ]
+
+try:
+    if not is_vision_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["image_processing_pixtral"] = ["PixtralImageProcessor"]
+
+
+if TYPE_CHECKING:
+    # These imports mirror `_import_structure`: each name is imported from the submodule it is listed under.
+    from .configuration_pixtral import PixtralVisionConfig
+    from .processing_pixtral import PixtralProcessor
+
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_pixtral import (
+            PixtralModel,
+            PixtralPreTrainedModel,
+        )
+
+    try:
+        if not is_vision_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .image_processing_pixtral import PixtralImageProcessor
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
diff --git a/src/transformers/models/pixtral/configuration_pixtral.py b/src/transformers/models/pixtral/configuration_pixtral.py
new file mode 100644
index 00000000000000..dcc1e458ca78a3
--- /dev/null
+++ b/src/transformers/models/pixtral/configuration_pixtral.py
@@ -0,0 +1,103 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Pixtral model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class PixtralVisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`PixtralModel`]. It is used to instantiate a
+    Pixtral model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the vision encoder used by Pixtral-12B.
+
+    e.g. [pixtral-hf/pixtral-9b](https://huggingface.co/pixtral-hf/pixtral-9b)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 4096):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads in the Transformer encoder.
+        num_channels (`int`, *optional*, defaults to 3):
+            Number of input channels in the input images.
+        image_size (`int`, *optional*, defaults to 1024):
+            Max dimension of the input images.
+        patch_size (`int`, *optional*, defaults to 16):
+            Size of the image patches.
+        hidden_act (`str`, *optional*, defaults to `"gelu"`):
+            Activation function used in the hidden layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            Dropout probability for the attention layers.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie the word embeddings with the input embeddings.
+
+    Example:
+
+    ```python
+    >>> from transformers import PixtralModel, PixtralVisionConfig
+
+    >>> # Initializing a Pixtral 12B style configuration
+    >>> configuration = PixtralVisionConfig()
+
+    >>> # Initializing a model from the pixtral 12B style configuration
+    >>> model = PixtralModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "pixtral"
+
+    def __init__(
+        self,
+        hidden_size=1024,
+        intermediate_size=4096,
+        num_hidden_layers=24,
+        num_attention_heads=16,
+        num_channels=3,
+        image_size=1024,
+        patch_size=16,
+        hidden_act="gelu",
+        attention_dropout=0.0,
+        rope_theta=10000.0,
+        tie_word_embeddings=False,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.attention_dropout = attention_dropout
+        self.hidden_act = hidden_act
+        self.rope_theta = rope_theta
+        self.tie_word_embeddings = tie_word_embeddings
+        self.head_dim = hidden_size // num_attention_heads  # derived per-head width, not an independent argument
diff --git a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py
new file mode 100644
index 00000000000000..c4190082d99471
--- /dev/null
+++ b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py
@@ -0,0 +1,285 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+
+import regex as re
+import torch
+from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
+from safetensors.torch import load_file as safe_load_file
+from tokenizers import Regex, Tokenizer, decoders, pre_tokenizers, processors
+from tokenizers.models import BPE
+
+from transformers import (
+ LlavaConfig,
+ LlavaForConditionalGeneration,
+ MistralConfig,
+ PixtralImageProcessor,
+ PixtralProcessor,
+ PixtralVisionConfig,
+ PreTrainedTokenizerFast,
+)
+from transformers.convert_slow_tokenizer import bytes_to_unicode
+
+
+"""
+# Here is how to get the original tokens!
+model_name = "mistralai/Pixtral-12B-2409"
+tok = MistralTokenizer.from_model(model_name)
+
+from mistral_common.protocol.instruct.request import ChatCompletionRequest, UserMessage, ImageChunk, TextChunk
+
+EXPECTED_TOKENS = tok.encode_chat_completion(
+ ChatCompletionRequest(
+ messages=[
+ UserMessage(
+ content=[
+ TextChunk(text="Describe the images"),
+ ] + [ImageChunk(image=img) for img in IMG_URLS]
+ )
+ ],
+ model="pixtral",
+ )
+)
+assert tokenizer.decode(inputs["input_ids"][0]) == EXPECTED_TOKENS
+"""
+
+OLD_KEY_TO_NEW_KEY_MAPPING = {
+ # Layer Normalization Weights
+ r"vision_encoder.transformer.layers.(\d+).input_layernorm.weight": r"vision_tower.transformer.layers.\1.attention_norm.weight",
+ r"vision_encoder.transformer.layers.(\d+).ffn_norm.weight": r"vision_tower.transformer.layers.\1.ffn_norm.weight",
+ # Self Attention Projections
+ r"vision_encoder.transformer.layers.(\d+).attention.wq.weight": r"vision_tower.transformer.layers.\1.attention.q_proj.weight",
+ r"vision_encoder.transformer.layers.(\d+).attention.wk.weight": r"vision_tower.transformer.layers.\1.attention.k_proj.weight",
+ r"vision_encoder.transformer.layers.(\d+).attention.wv.weight": r"vision_tower.transformer.layers.\1.attention.v_proj.weight",
+ r"vision_encoder.transformer.layers.(\d+).attention.wo.weight": r"vision_tower.transformer.layers.\1.attention.o_proj.weight",
+ # MLP Projections
+ r"vision_encoder.transformer.layers.(\d+).feed_forward.w1.weight": r"vision_tower.transformer.layers.\1.feed_forward.gate_proj.weight",
+ r"vision_encoder.transformer.layers.(\d+).feed_forward.w2.weight": r"vision_tower.transformer.layers.\1.feed_forward.down_proj.weight",
+ r"vision_encoder.transformer.layers.(\d+).feed_forward.w3.weight": r"vision_tower.transformer.layers.\1.feed_forward.up_proj.weight",
+ # Remaining keys: catch-all vision prefix rename (kept after the per-layer vision patterns), projector, language model
+ r"vision_encoder": r"vision_tower",
+ r"vision_language_adapter.w_in": r"multi_modal_projector.linear_1",
+ r"vision_language_adapter.w_out": r"multi_modal_projector.linear_2",
+ r"layers.(\d+).attention.wq.weight": r"language_model.model.layers.\1.self_attn.q_proj.weight",
+ r"layers.(\d+).attention.wk.weight": r"language_model.model.layers.\1.self_attn.k_proj.weight",
+ r"layers.(\d+).attention.wv.weight": r"language_model.model.layers.\1.self_attn.v_proj.weight",
+ r"layers.(\d+).attention.wo.weight": r"language_model.model.layers.\1.self_attn.o_proj.weight",
+ r"layers.(\d+).feed_forward.w1.weight": r"language_model.model.layers.\1.mlp.gate_proj.weight",
+ r"layers.(\d+).feed_forward.w2.weight": r"language_model.model.layers.\1.mlp.down_proj.weight",
+ r"layers.(\d+).feed_forward.w3.weight": r"language_model.model.layers.\1.mlp.up_proj.weight",
+ r"layers.(\d+).ffn_norm.weight": r"language_model.model.layers.\1.post_attention_layernorm.weight",
+ r"layers.(\d+).attention_norm.weight": r"language_model.model.layers.\1.input_layernorm.weight",
+ r"tok_embeddings.weight": r"language_model.model.embed_tokens.weight",
+ r"output.weight": r"language_model.lm_head.weight",
+ r"norm.weight": r"language_model.model.norm.weight",
+}
+
+
+class MistralConverter:
+    """
+    Converts a Mistral Tekken (tiktoken-style) token-to-rank vocabulary into a byte-level BPE `tokenizers.Tokenizer`.
+    """
+
+    def __init__(
+        self,
+        vocab=None,
+        pattern=r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""",
+        add_prefix_space=False,
+        additional_special_tokens=None,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(*args)
+        self.vocab = vocab
+        self.pattern = pattern
+        self.add_prefix_space = add_prefix_space
+        self.additional_special_tokens = list(additional_special_tokens or [])  # `None` means "no special tokens"
+
+    def extract_vocab_merges_from_model(self, vocab: dict):
+        bpe_ranks = vocab  # token -> rank; the name `vocab` is reused below for the converted mapping
+        byte_encoder = bytes_to_unicode()
+        special_tokens = set(self.additional_special_tokens)  # hashed lookup instead of a list scan per token
+
+        def token_bytes_to_string(b):
+            return "".join([byte_encoder[ord(char)] for char in b.decode("latin-1")])
+
+        merges = []
+        vocab = {}
+        for idx, (token, rank) in enumerate(bpe_ranks.items()):
+            if token not in special_tokens:
+                vocab[token_bytes_to_string(token)] = idx
+                if len(token) == 1:
+                    continue  # a single-element token cannot be split into two pieces, so it yields no merge
+                local = []
+                for index in range(1, len(token)):
+                    piece_l, piece_r = token[:index], token[index:]
+                    if piece_l in bpe_ranks and piece_r in bpe_ranks and (piece_l + piece_r) in bpe_ranks:
+                        local.append((piece_l, piece_r, rank))
+                merges.extend(sorted(local, key=lambda x: (bpe_ranks[x[0]], bpe_ranks[x[1]])))
+            else:
+                vocab[token] = idx  # special tokens are stored as-is, without the byte-level re-encoding
+        merges = sorted(merges, key=lambda val: val[2], reverse=False)
+        merges = [(token_bytes_to_string(val[0]), token_bytes_to_string(val[1])) for val in merges]
+        return vocab, merges
+
+    def tokenizer(self):
+        vocab_scores, merges = self.extract_vocab_merges_from_model(self.vocab)
+        tokenizer = Tokenizer(BPE(vocab_scores, merges, fuse_unk=False))
+        if hasattr(tokenizer.model, "ignore_merges"):
+            tokenizer.model.ignore_merges = True
+        return tokenizer
+
+    def converted(self) -> Tokenizer:
+        tokenizer = self.tokenizer()
+        tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
+            [
+                pre_tokenizers.Split(Regex(self.pattern), behavior="isolated", invert=False),
+                pre_tokenizers.ByteLevel(add_prefix_space=self.add_prefix_space, use_regex=False),
+            ]
+        )
+        tokenizer.decoder = decoders.ByteLevel()
+        tokenizer.add_special_tokens(self.additional_special_tokens)
+
+        tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
+
+        return tokenizer
+
+
+def convert_mistral_tokenizer():
+    """Build a `PreTrainedTokenizerFast` from the `mistral_common` tokenizer of `mistralai/Pixtral-12B-2409`."""
+    model_name = "mistralai/Pixtral-12B-2409"
+    tokenizer = MistralTokenizer.from_model(model_name)
+
+    vocab = tokenizer.instruct_tokenizer.tokenizer._tekken_token2id_nospecial
+    all_special = [
+        token.value if hasattr(token, "value") else token
+        for token in tokenizer.instruct_tokenizer.tokenizer._all_special_tokens
+    ]
+    specials_tokens = {token: all_special.index(token) for token in all_special}
+    specials_tokens.update(vocab)
+    vocab = specials_tokens
+
+    tokenizer = PreTrainedTokenizerFast(
+        tokenizer_object=MistralConverter(vocab=vocab, additional_special_tokens=all_special).converted(),
+        bos_token="<s>",
+        unk_token="<unk>",
+        eos_token="</s>",
+    )
+    tokenizer.model_input_names = ["input_ids", "attention_mask"]
+
+    return tokenizer
+
+
+def permute_for_rope(value, n_heads, config):
+    rows, cols = value.shape[0], config.hidden_size  # `value` is viewed below as (rows, cols)
+    per_head = value.view(n_heads, rows // n_heads // 2, 2, cols)  # split each head's rows into (pairs, 2)
+    return per_head.transpose(1, 2).reshape(rows, cols)  # swap the pair axes, then restore the 2-D layout
+
+
+def convert_dictionnary(original_state_dict, vision_config, text_config):
+    """Rename the original checkpoint keys via `OLD_KEY_TO_NEW_KEY_MAPPING` and permute the q/k projections."""
+    converted = {}
+
+    # Every key is preceded by "\n" in the joined text, so each pattern only matches at the start of a key.
+    joined_old = "\n" + "\n".join(original_state_dict.keys())
+    joined_new = joined_old
+    for pattern, replacement in OLD_KEY_TO_NEW_KEY_MAPPING.items():
+        joined_new = re.sub(r"\n" + pattern, r"\n" + replacement, joined_new)
+
+    key_map = dict(zip(joined_old.split("\n"), joined_new.split("\n")))
+
+    for old_key, tensor in original_state_dict.items():
+        renamed = key_map[old_key]
+        from_vision = "vision_encoder" in old_key
+        _config = vision_config if from_vision else text_config
+
+        # Only the query and key projections are permuted; every other weight is copied through unchanged.
+        if "q_proj" in renamed or "k_proj" in renamed:
+            if from_vision or "k_proj" not in renamed:
+                n_heads = _config.num_attention_heads
+            else:
+                # key projections of the text model are split by the key/value head count instead
+                n_heads = _config.num_key_value_heads
+            tensor = permute_for_rope(tensor, n_heads, _config)
+
+        converted[renamed] = tensor
+    return converted
+
+
+def convert_mistral_model(input_dir, output_dir):
+    """Convert `{input_dir}/consolidated.safetensors` and save the HF config, model and processor to `output_dir`."""
+    language_config = MistralConfig(
+        attention_dropout=0.0,
+        bos_token_id=1,
+        eos_token_id=2,
+        head_dim=128,
+        hidden_act="silu",
+        hidden_size=5120,
+        initializer_range=0.02,
+        intermediate_size=14336,
+        max_position_embeddings=1024000,
+        model_type="mistral",
+        num_attention_heads=32,
+        num_hidden_layers=40,
+        num_key_value_heads=8,
+        rms_norm_eps=1e-05,
+        rope_theta=1000000000.0,
+        sliding_window=None,
+        tie_word_embeddings=False,
+        vocab_size=131072,
+    )
+    visual_config = PixtralVisionConfig()
+    llava_config = LlavaConfig(
+        visual_config,
+        language_config,
+        vision_feature_layer=-1,
+        image_token_index=10,
+        vision_feature_select_strategy="full",
+        image_seq_length=1,
+    )
+    llava_config.architectures = ["LlavaForConditionalGeneration"]
+    llava_config.save_pretrained(output_dir)
+
+    # Load the original weights and rename/permute them into the HF layout.
+    checkpoint = safe_load_file(f"{input_dir}/consolidated.safetensors")
+    hf_state_dict = convert_dictionnary(checkpoint, visual_config, language_config)
+
+    # The model is built under the "meta" device and the converted tensors are attached with `assign=True`.
+    with torch.device("meta"):
+        model = LlavaForConditionalGeneration(llava_config)
+    model.load_state_dict(hf_state_dict, strict=True, assign=True)
+    model.save_pretrained(output_dir)
+
+    tokenizer = convert_mistral_tokenizer()
+    processor = PixtralProcessor(tokenizer=tokenizer, image_processor=PixtralImageProcessor(), image_token="[IMG]")
+    processor.save_pretrained(output_dir)
+
+
+def main():
+    """Parse the command-line arguments and run the Pixtral checkpoint conversion."""
+    parser = argparse.ArgumentParser()
+    # Both paths are mandatory: the conversion reads the checkpoint from one and writes the result to the other.
+    parser.add_argument(
+        "--input_dir",
+        required=True,
+        help="Directory containing the original Pixtral checkpoint (`consolidated.safetensors`)",
+    )
+    parser.add_argument("--output_dir", required=True, help="Location to write HF model and tokenizer")
+
+    args = parser.parse_args()
+    convert_mistral_model(args.input_dir, args.output_dir)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/src/transformers/models/pixtral/image_processing_pixtral.py b/src/transformers/models/pixtral/image_processing_pixtral.py
new file mode 100644
index 00000000000000..c6d18420bec575
--- /dev/null
+++ b/src/transformers/models/pixtral/image_processing_pixtral.py
@@ -0,0 +1,519 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for Pixtral."""
+
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import (
+ resize,
+ to_channel_dimension_format,
+)
+from ...image_utils import (
+ ChannelDimension,
+ ImageInput,
+ PILImageResampling,
+ get_image_size,
+ infer_channel_dimension_format,
+ is_scaled_image,
+ is_valid_image,
+ to_numpy_array,
+ valid_images,
+ validate_kwargs,
+ validate_preprocess_arguments,
+)
+from ...utils import TensorType, is_torch_device, is_torch_dtype, is_torch_tensor, is_vision_available, logging
+from ...utils.import_utils import requires_backends
+
+
+logger = logging.get_logger(__name__)
+
+
+if is_vision_available():
+ import PIL
+
+
+class BatchMixFeature(BatchFeature):
+    def to(self, *args, **kwargs) -> "BatchMixFeature":
+        """
+        Move/cast the stored values by calling `.to(*args, **kwargs)` on them (PyTorch only).
+
+        A value that is a list is iterated as a list of per-sample lists: its tensors come back as one flat
+        list, and elements that are not tensors are dropped.
+
+        Args:
+            args (`Tuple`):
+                Will be passed to the `to(...)` function of the tensors.
+            kwargs (`Dict`, *optional*):
+                Will be passed to the `to(...)` function of the tensors.
+
+        Returns:
+            [`BatchMixFeature`]: The same instance after modification.
+        """
+        requires_backends(self, ["torch"])
+        import torch  # noqa
+
+        # Work out the target device: the `device` keyword wins, otherwise look at the first positional argument.
+        device = kwargs.get("device")
+        if device is None and args:
+            first = args[0]
+            # A dtype as first argument is fine and simply means that no device was requested.
+            if not is_torch_dtype(first):
+                if isinstance(first, (str, int)) or is_torch_device(first):
+                    device = first
+                else:
+                    raise ValueError(f"Attempting to cast a BatchFeature to type {str(first)}. This is not supported.")
+
+        # We cast only floating point tensors to avoid issues with tokenizers casting `LongTensor` to `FloatTensor`
+        moved = {}
+        for name, item in self.items():
+            if isinstance(item, list):
+                # Flatten one level of nesting, keeping only the tensors.
+                moved[name] = [
+                    tensor.to(*args, **kwargs) for group in item for tensor in group if is_torch_tensor(tensor)
+                ]
+            elif torch.is_floating_point(item):
+                moved[name] = item.to(*args, **kwargs)
+            elif device is not None:
+                # Non floating point tensors are only moved, never cast.
+                moved[name] = item.to(device=device)
+            else:
+                moved[name] = item
+        self.data = moved
+        return self
+
+
+# Copied from transformers.models.idefics2.image_processing_idefics2.make_list_of_images
+def make_list_of_images(images: ImageInput) -> List[List[np.ndarray]]:
+ """
+ Convert a single image or a list of images to a list of numpy arrays.
+
+ Args:
+ images (`ImageInput`):
+ A single image or a list of images.
+
+ Returns:
+ A list of numpy arrays.
+ """
+ # If it's a single image, convert it to a list of lists
+ if is_valid_image(images):
+ images = [[images]]
+ # If it's a list of images, it's a single batch, so convert it to a list of lists
+ elif isinstance(images, (list, tuple)) and len(images) > 0 and is_valid_image(images[0]):
+ images = [images]
+ # If it's a list of batches, it's already in the right format
+ elif (
+ isinstance(images, (list, tuple))
+ and len(images) > 0
+ and isinstance(images[0], (list, tuple))
+ and is_valid_image(images[0][0])
+ ):
+ pass
+ else:
+ raise ValueError(
+ "Invalid input type. Must be a single image, a list of images, or a list of batches of images."
+ )
+ return images
+
+
+# Adapted from function in image_transforms.py to ensure any transparent pixels are converted to white.
+def convert_to_rgb(image: ImageInput) -> ImageInput:
+    """
+    Flatten a PIL image onto a white background and return it in RGB mode.
+
+    Inputs that are not `PIL.Image.Image` instances, or that are already in RGB mode, are returned unchanged.
+
+    Args:
+        image (Image):
+            The image to convert.
+    """
+    requires_backends(convert_to_rgb, ["vision"])
+
+    # Nothing to do for non-PIL inputs or for images that are already RGB.
+    if not isinstance(image, PIL.Image.Image) or image.mode == "RGB":
+        return image
+
+    # Going through RGBA first provides the image that is also passed as the paste mask below.
+    rgba = image.convert("RGBA")
+
+    # Paste onto a white canvas so that transparent pixels end up white, then drop the alpha channel.
+    canvas = PIL.Image.new("RGBA", rgba.size, "WHITE")
+    canvas.paste(rgba, (0, 0), rgba)
+    flattened = canvas.convert("RGB")
+    return flattened
+
+
+def _num_image_tokens(image_size: Tuple[int, int], patch_size: Union[int, Tuple[int, int]]) -> Tuple[int, int]:
+    """
+    Calculate the number of image tokens along each dimension, counting a partially covered patch as a full one.
+
+    Args:
+        image_size (`Tuple[int, int]`):
+            The size of the image as `(height, width)`.
+        patch_size (`int` or `Tuple[int, int]`):
+            The patch size as `(height, width)`. If an integer is given, `(patch_size, patch_size)` is used.
+
+    Returns:
+        `Tuple[int, int]`: The number of tokens along the height and along the width, in that order.
+    """
+    height, width = image_size
+    patch_height, patch_width = patch_size if isinstance(patch_size, (tuple, list)) else (patch_size, patch_size)
+    num_width_tokens = (width - 1) // patch_width + 1  # ceil(width / patch_width) in integer arithmetic
+    num_height_tokens = (height - 1) // patch_height + 1
+    return num_height_tokens, num_width_tokens
+
+
+def get_resize_output_image_size(
+    input_image: np.ndarray,
+    size: Union[int, Tuple[int, int], List[int], Tuple[int]],
+    patch_size: Union[int, Tuple[int, int], List[int], Tuple[int]],
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> tuple:
+    """
+    Find the target (height, width) dimension of the output image after resizing given the input image and the desired
+    size. The image is shrunk to fit `size` if needed, then each side is rounded up to a multiple of the patch size.
+
+    Args:
+        input_image (`np.ndarray`):
+            The image to resize.
+        size (`int` or `Tuple[int, int]`):
+            Maximum `(height, width)` of the image before patch rounding. If `size` is an integer, `(size, size)` will be used.
+        patch_size (`int` or `Tuple[int, int]`):
+            The patch_size as `(height, width)` to use for resizing the image. If patch_size is an integer, `(patch_size, patch_size)`
+            will be used
+        input_data_format (`ChannelDimension`, *optional*):
+            The channel dimension format of the input image. If unset, will use the inferred format from the input.
+
+    Returns:
+        `tuple`: The target (height, width) dimension of the output image after resizing.
+    """
+    max_height, max_width = size if isinstance(size, (tuple, list)) else (size, size)
+    patch_height, patch_width = patch_size if isinstance(patch_size, (tuple, list)) else (patch_size, patch_size)
+    height, width = get_image_size(input_image, input_data_format)
+
+    ratio = max(height / max_height, width / max_width)
+
+    if ratio > 1:
+        # Original implementation uses `round`, which applies banker's rounding and can lead to surprising results
+        height = int(np.ceil(height / ratio))
+        width = int(np.ceil(width / ratio))
+
+    num_height_tokens, num_width_tokens = _num_image_tokens((height, width), (patch_height, patch_width))
+    return num_height_tokens * patch_height, num_width_tokens * patch_width
+
+
+# Hack: borrow the (is_tensor, as_tensor) helper pair from an empty BatchFeature, without batching the images
+def _get_is_as_tensor_fns(tensor_type: Union[str, TensorType]) -> Tuple[Callable, Callable]:
+ return BatchFeature()._get_is_as_tensor_fns(tensor_type)
+
+
+def convert_to_tensor(array, tensor_type: Union[str, TensorType]) -> Any:
+    """Return `array` as a `tensor_type` tensor, passing it through untouched when it already is one."""
+    already_tensor, to_tensor = _get_is_as_tensor_fns(tensor_type)
+    # Skip the conversion entirely when the input already has the requested tensor type.
+    return array if already_tensor(array) else to_tensor(array)
+
+
+class PixtralImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a Pixtral image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
+            `do_resize` in the `preprocess` method.
+        size (`Dict[str, int]` *optional*, defaults to `{"longest_edge": 1024}`):
+            Size of the maximum dimension of either the height or width dimension of the image. Used to control how
+            images are resized. If either the height or width are greater than `size["longest_edge"]` then both the height and width are rescaled by `height / ratio`, `width / ratio` where `ratio = max(height / longest_edge, width / longest_edge)`.
+        patch_size (`Dict[str, int]` *optional*, defaults to `{"height": 16, "width": 16}`):
+            Size of the patches in the model, used to calculate the output image size. Can be overridden by `patch_size` in the `preprocess` method.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
+            Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
+            the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
+            method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method.
+        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess`
+            method.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Dict[str, int] = None,
+        patch_size: Dict[str, int] = None,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: bool = True,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"longest_edge": 1024}
+        patch_size = patch_size if patch_size is not None else {"height": 16, "width": 16}
+        patch_size = get_size_dict(patch_size, default_to_square=True)
+
+        self.do_resize = do_resize
+        self.size = size
+        self.patch_size = patch_size
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else [0.48145466, 0.4578275, 0.40821073]
+        self.image_std = image_std if image_std is not None else [0.26862954, 0.26130258, 0.27577711]
+        self.do_convert_rgb = do_convert_rgb
+        self._valid_processor_keys = [
+            "images",
+            "do_resize",
+            "size",
+            "patch_size",
+            "resample",
+            "do_rescale",
+            "rescale_factor",
+            "do_normalize",
+            "image_mean",
+            "image_std",
+            "do_convert_rgb",
+            "return_tensors",
+            "data_format",
+            "input_data_format",
+        ]
+
+    def resize(
+        self,
+        image: np.ndarray,
+        size: Dict[str, int],
+        patch_size: Dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image to the output size that `get_resize_output_image_size` computes from the maximum `size` and
+        the `patch_size`.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`Dict[str, int]`):
+                Dict containing the longest possible edge of the image.
+            patch_size (`Dict[str, int]`):
+                Patch size used to calculate the size of the output image.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+                Resampling filter to use when resizing the image.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+        """
+        if "longest_edge" in size:
+            size = (size["longest_edge"], size["longest_edge"])
+        elif "height" in size and "width" in size:
+            size = (size["height"], size["width"])
+        else:
+            raise ValueError("size must contain either 'longest_edge' or 'height' and 'width'.")
+
+        if "height" in patch_size and "width" in patch_size:
+            patch_size = (patch_size["height"], patch_size["width"])
+        else:
+            raise ValueError("patch_size must contain 'height' and 'width'.")
+
+        output_size = get_resize_output_image_size(
+            image,
+            size=size,
+            patch_size=patch_size,
+            input_data_format=input_data_format,
+        )
+        return resize(
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: bool = None,
+        size: Dict[str, int] = None,
+        patch_size: Dict[str, int] = None,
+        resample: PILImageResampling = None,
+        do_rescale: bool = None,
+        rescale_factor: float = None,
+        do_normalize: bool = None,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: bool = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> "BatchMixFeature":
+        """
+        Preprocess an image or batch of images into a `BatchMixFeature` holding `pixel_values` and `image_sizes`.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
+                Describes the maximum input dimensions to the model.
+            patch_size (`Dict[str, int]`, *optional*, defaults to `self.patch_size`):
+                Patch size in the model. Used to calculate the image after resizing.
+            resample (`int`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
+                has an effect if `do_resize` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
+                `True`.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                    - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                    - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        patch_size = patch_size if patch_size is not None else self.patch_size
+        patch_size = get_size_dict(patch_size, default_to_square=True)
+
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        size = size if size is not None else self.size
+        resample = resample if resample is not None else self.resample
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+
+        validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys)
+
+        images_list = make_list_of_images(images)
+
+        if not valid_images(images_list[0]):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+
+        if do_convert_rgb:
+            images_list = [[convert_to_rgb(image) for image in images] for images in images_list]
+
+        # All transformations expect numpy arrays.
+        images_list = [[to_numpy_array(image) for image in images] for images in images_list]
+
+        if is_scaled_image(images_list[0][0]) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images_list[0][0])
+
+        batch_images = []
+        batch_image_sizes = []
+        for sample_images in images_list:
+            images = []
+            image_sizes = []
+            for image in sample_images:
+                if do_resize:
+                    image = self.resize(
+                        image=image,
+                        size=size,
+                        patch_size=patch_size,
+                        resample=resample,
+                        input_data_format=input_data_format,
+                    )
+
+                if do_rescale:
+                    image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+
+                if do_normalize:
+                    image = self.normalize(
+                        image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
+                    )
+
+                images.append(image)
+                image_sizes.append(get_image_size(image, input_data_format))
+            batch_images.append(images)
+            batch_image_sizes.append(image_sizes)
+
+        images_list = [
+            [to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images]
+            for images in batch_images
+        ]
+
+        # Convert to tensor type outside of BatchFeature to avoid batching the images of different sizes
+        images_list = [[convert_to_tensor(image, return_tensors) for image in images] for images in images_list]
+        return BatchMixFeature(data={"pixel_values": images_list, "image_sizes": batch_image_sizes}, tensor_type=None)
diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py
new file mode 100644
index 00000000000000..0e10c78b7852af
--- /dev/null
+++ b/src/transformers/models/pixtral/modeling_pixtral.py
@@ -0,0 +1,517 @@
+# coding=utf-8
+# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Pixtral model."""
+
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ... import PreTrainedModel
+from ...activations import ACT2FN
+from ...modeling_outputs import BaseModelOutput
+from ...utils import (
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+)
+from .configuration_pixtral import PixtralVisionConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+def position_ids_in_meshgrid(patch_embeds_list, max_width):  # flat id of every patch in a `max_width`-wide grid
+    positions = []
+    for patch in patch_embeds_list:
+        height, width = patch.shape[-2:]  # the last two dims are read as this image's patch-grid size
+        mesh = torch.meshgrid(torch.arange(height), torch.arange(width), indexing="ij")
+        h_grid, v_grid = torch.stack(mesh, dim=-1).reshape(-1, 2).chunk(2, -1)  # row index, column index per patch
+        ids = h_grid * max_width + v_grid  # row-major id: row * max_width + column
+        positions.append(ids[:, 0])  # drop the trailing size-1 dim left by `chunk`
+    return torch.cat(positions)  # ids of all images, concatenated in input order
+
+
+class PixtralRotaryEmbedding(nn.Module):
+    """
+    The key with pixtral embedding is just that you have a frequency for each pixel position.
+    If you have height x width pixels (or embedding pixels)
+
+    then the frequency used for ROPE is given by indexing the pre-computed frequency on the
+    width and height.
+
+    What you output is of dimension batch, height * width, dim with dim the embed dim.
+
+    This simply means that for each image hidden state, you are going to add
+    a corresponding positional embedding, based on its index in the grid.
+    """
+
+    def __init__(self, config, device):  # NOTE(review): `device` is accepted but not used in this constructor
+        super().__init__()
+        self.rope_type = "default"
+        self.dim = config.head_dim
+        self.base = config.rope_theta
+        max_patches_per_side = config.image_size // config.patch_size
+        freqs = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float() / self.dim))
+
+        h = torch.arange(max_patches_per_side, device=freqs.device)
+        w = torch.arange(max_patches_per_side, device=freqs.device)
+
+        freqs_h = torch.outer(h, freqs[::2]).float()  # row angles use the even-indexed frequencies
+        freqs_w = torch.outer(w, freqs[1::2]).float()  # column angles use the odd-indexed frequencies
+        inv_freq = torch.cat(
+            [
+                freqs_h[:, None, :].repeat(1, max_patches_per_side, 1),
+                freqs_w[None, :, :].repeat(max_patches_per_side, 1, 1),
+            ],
+            dim=-1,
+        ).reshape(-1, self.dim // 2)  # we reshape to only index on the position indexes, not tuple of indexes
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+
+        # TODO maybe make it torch compatible later on. We can also just slice
+        self.register_buffer("inv_freq", torch.cat((inv_freq, inv_freq), dim=-1), persistent=False)
+
+    @torch.no_grad()
+    def forward(self, x, position_ids):
+        if "dynamic" in self.rope_type:  # never true here: `rope_type` is set to "default" in `__init__`
+            self._dynamic_frequency_update(position_ids, device=x.device)
+
+        # Core RoPE block
+        freqs = self.inv_freq[position_ids]
+        # `inv_freq` already holds one row of angles per grid position, so a plain lookup is all that is needed
+        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
+        device_type = x.device.type
+        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):
+            emb = freqs
+            cos = emb.cos()
+            sin = emb.sin()
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)  # cast back to the dtype of `x`
+
+    def _dynamic_frequency_update(self, position_ids, device):
+        """
+        dynamic RoPE layers should recompute `inv_freq` in the following situations:
+        1 - growing beyond the cached sequence length (allow scaling)
+        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
+        """
+        # NOTE(review): `max_seq_len_cached`, `rope_init_fn`, `rope_kwargs`, `config`, `original_max_seq_len` and
+        # `original_inv_freq` are not set in `__init__` above, so this would fail if it were ever reached
+        seq_len = torch.max(position_ids) + 1
+        if seq_len > self.max_seq_len_cached:  # growth
+            inv_freq, self.attention_scaling = self.rope_init_fn(
+                self.config, device, seq_len=seq_len, **self.rope_kwargs
+            )
+            self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: may break with compilation
+            self.max_seq_len_cached = seq_len
+        if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:  # reset
+            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+            self.max_seq_len_cached = self.original_max_seq_len
+
+
+# Copied from transformers.models.llama.modeling_llama.rotate_half
+def rotate_half(x):
+ """Rotates half the hidden dims of the input."""
+ x1 = x[..., : x.shape[-1] // 2]
+ x2 = x[..., x.shape[-1] // 2 :]
+ return torch.cat((-x2, x1), dim=-1)
+
+
+# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+ """Applies Rotary Position Embedding to the query and key tensors.
+
+ Args:
+ q (`torch.Tensor`): The query tensor.
+ k (`torch.Tensor`): The key tensor.
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
+ position_ids (`torch.Tensor`, *optional*):
+ Deprecated and unused.
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+ Returns:
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+ """
+ cos = cos.unsqueeze(unsqueeze_dim)
+ sin = sin.unsqueeze(unsqueeze_dim)
+ q_embed = (q * cos) + (rotate_half(q) * sin)
+ k_embed = (k * cos) + (rotate_half(k) * sin)
+ return q_embed, k_embed
+
+
+class PixtralAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+
+        self.scale = self.head_dim**-0.5  # 1 / sqrt(head_dim), applied to the raw attention scores
+        self.dropout = config.attention_dropout
+
+        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
+        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
+        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
+        self.o_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # (cos, sin), unpacked below
+        output_attentions: Optional[bool] = False,  # not read here: the weights are always returned
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Input shape: Batch x Time x Channel"""
+
+        batch_size, patches, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2)
+
+        cos, sin = position_embeddings  # required in practice: unpacking `None` would raise
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, unsqueeze_dim=0)
+
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale
+
+        if attention_mask is not None:
+            attn_weights = attn_weights + attention_mask  # the mask is additive, applied before the softmax
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(batch_size, patches, -1)  # merge the heads back into one channel dim
+
+        attn_output = self.o_proj(attn_output)
+
+        return attn_output, attn_weights
+
+
+# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Pixtral
+class PixtralMLP(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.hidden_size = config.hidden_size
+ self.intermediate_size = config.intermediate_size
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+ self.act_fn = ACT2FN[config.hidden_act]
+
+ def forward(self, hidden_state):
+ return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Pixtral
+class PixtralRMSNorm(nn.Module):
+ def __init__(self, hidden_size, eps=1e-6):
+ """
+ PixtralRMSNorm is equivalent to T5LayerNorm
+ """
+ super().__init__()
+ self.weight = nn.Parameter(torch.ones(hidden_size))
+ self.variance_epsilon = eps
+
+ def forward(self, hidden_states):
+ input_dtype = hidden_states.dtype
+ hidden_states = hidden_states.to(torch.float32)
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+ return self.weight * hidden_states.to(input_dtype)
+
+ def extra_repr(self):
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class PixtralAttentionLayer(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.attention_norm = PixtralRMSNorm(config.hidden_size, eps=1e-5)
+        self.feed_forward = PixtralMLP(config)
+        self.attention = PixtralAttention(config)
+        self.ffn_norm = PixtralRMSNorm(config.hidden_size, eps=1e-5)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # passed through to the attention
+        output_attentions: Optional[bool] = False,
+    ) -> Tuple[torch.FloatTensor]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`):
+                Input to the layer of shape `(batch, seq_len, embed_dim)`.
+            attention_mask (`torch.FloatTensor`):
+                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
+            output_attentions (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+
+        hidden_states = self.attention_norm(hidden_states)  # the norm is applied before the attention (pre-norm)
+        hidden_states, attn_weights = self.attention(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_embeddings=position_embeddings,
+            output_attentions=output_attentions,
+        )
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.ffn_norm(hidden_states)  # same pattern for the feed-forward: norm, then residual add
+        hidden_states = self.feed_forward(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+        return outputs
+
+
+class PixtralTransformer(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.layers = torch.nn.ModuleList()
+        for _ in range(config.num_hidden_layers):
+            self.layers.append(PixtralAttentionLayer(config))
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        inputs_embeds,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutput]:
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Embeddings which serve as input to the first layer.
+            attention_mask (`torch.Tensor`, *optional*):
+                Additive mask that the attention adds to its scores before the softmax: `0` keeps a position and a
+                large negative value masks it. It is passed unchanged to every layer.
+            position_embeddings (`Tuple[torch.Tensor, torch.Tensor]`, *optional*):
+                Rotary `(cos, sin)` pair. It is passed unchanged to every layer.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+
+        Returns:
+            A [`BaseModelOutput`], or a tuple of its non-`None` fields when `return_dict` is `False`.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        hidden_states = inputs_embeds
+        for encoder_layer in self.layers:
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    encoder_layer.__call__,
+                    hidden_states,
+                    attention_mask,
+                    position_embeddings,
+                    output_attentions,
+                )
+            else:
+                layer_outputs = encoder_layer(
+                    hidden_states,
+                    attention_mask,
+                    position_embeddings=position_embeddings,
+                    output_attentions=output_attentions,
+                )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        # Return every collected state when requested (this used to be dropped). Otherwise keep the previous
+        # one-element list so that existing callers indexing `hidden_states[-1]` keep working.
+        states = encoder_states if output_hidden_states else [hidden_states]
+        return BaseModelOutput(last_hidden_state=hidden_states, hidden_states=states, attentions=all_attentions)
+
+
+PIXTRAL_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+
+ Parameters:
+        config ([`PixtralVisionConfig`]):
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
+ load the weights associated with the model, only the configuration. Check out the
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+@add_start_docstrings(
+    "The bare Pixtral Model outputting raw hidden-states without any specific head on top.",
+    PIXTRAL_START_DOCSTRING,
+)
+class PixtralPreTrainedModel(PreTrainedModel):
+    config_class = PixtralVisionConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["PixtralAttentionLayer"]  # must name a module class that exists in this file
+    _skip_keys_device_placement = "past_key_values"
+    _supports_cache_class = True
+
+    def _init_weights(self, module):
+        # important: this ported version of Pixtral isn't meant for training from scratch - only
+        # inference and fine-tuning - so the proper init weights code has been removed - the original codebase
+        # https://github.com/haotian-liu/LLaVA/tree/main/pixtral should serve for that purpose
+        std = (
+            self.config.initializer_range
+            if hasattr(self.config, "initializer_range")
+            else self.config.text_config.initializer_range
+        )
+
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+
+PIXTRAL_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values: list of N_img images of variable sizes,
+ each of shape (C, H, W)
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+def generate_block_attention_mask(patch_embeds_list, tensor):  # `patch_embeds_list`: number of tokens per block
+    dtype = tensor.dtype
+    device = tensor.device
+    seq_len = tensor.shape[1]
+    d_min = torch.finfo(dtype).min
+    causal_mask = torch.full((seq_len, seq_len), fill_value=d_min, dtype=dtype, device=device)  # all masked
+
+    block_end_idx = torch.tensor(patch_embeds_list).cumsum(-1)
+    block_start_idx = torch.tensor([0] + patch_embeds_list[:-1]).cumsum(-1)
+    for start, end in zip(block_start_idx, block_end_idx):
+        causal_mask[start:end, start:end] = 0  # despite the name, the mask is block-diagonal, not causal
+
+    causal_mask = causal_mask[None, None, :, :].expand(tensor.shape[0], 1, -1, -1)  # (batch, 1, seq, seq)
+    return causal_mask
+
+
+@add_start_docstrings(
+    """The bare Pixtral vision encoder outputting raw hidden-states without any specific head on top.""",
+    PIXTRAL_START_DOCSTRING,
+)
+class PixtralModel(PixtralPreTrainedModel):
+    base_model_prefix = "vision_encoder"
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        self.patch_conv = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=config.hidden_size,
+            kernel_size=config.patch_size,
+            stride=config.patch_size,
+            bias=False,
+        )
+        self.ln_pre = PixtralRMSNorm(config.hidden_size, eps=1e-5)
+        self.transformer = PixtralTransformer(config)
+        self.patch_positional_embedding = PixtralRotaryEmbedding(config, device=self.device)
+
+    @add_start_docstrings_to_model_forward(PIXTRAL_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        pixel_values: List[torch.Tensor],
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        *args,
+        **kwargs,
+    ) -> Union[Tuple, BaseModelOutput]:
+        """
+        Returns:
+            The output of the transformer: a [`BaseModelOutput`] (or a tuple when `return_dict` is `False`) whose
+            `last_hidden_state` holds the tokens of all images concatenated along the sequence dimension.
+        """
+        # pass images through initial convolution independently
+        patch_embeds_list = [self.patch_conv(img.unsqueeze(0).to(self.dtype)) for img in pixel_values]
+
+        # flatten to a single sequence
+        patch_embeds = torch.cat([p.flatten(2).permute(0, 2, 1) for p in patch_embeds_list], dim=1)
+        patch_embeds = self.ln_pre(patch_embeds)
+
+        # positional embeddings
+        position_ids = position_ids_in_meshgrid(
+            patch_embeds_list, max_width=self.config.image_size // self.config.patch_size
+        ).to(self.device)
+        position_embedding = self.patch_positional_embedding(patch_embeds, position_ids)
+        patch_counts = [p.shape[-2] * p.shape[-1] for p in patch_embeds_list]  # number of tokens of each image
+        attention_mask = generate_block_attention_mask(patch_counts, patch_embeds)
+        return self.transformer(
+            patch_embeds, attention_mask, position_embedding, output_attentions, output_hidden_states, return_dict
+        )
diff --git a/src/transformers/models/pixtral/processing_pixtral.py b/src/transformers/models/pixtral/processing_pixtral.py
new file mode 100644
index 00000000000000..9362703c8aa6da
--- /dev/null
+++ b/src/transformers/models/pixtral/processing_pixtral.py
@@ -0,0 +1,282 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for Pixtral.
+"""
+
+from typing import List, Optional, Union
+
+from ...feature_extraction_utils import BatchFeature
+from ...image_utils import ImageInput, is_valid_image, load_image
+from ...processing_utils import ProcessorMixin
+from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
+from ...utils import TensorType, is_torch_device, is_torch_dtype, is_torch_tensor, logging, requires_backends
+
+
+logger = logging.get_logger(__name__)
+
+
+# Copied from transformers.models.idefics2.processing_idefics2.is_url
+def is_url(val) -> bool:
+ return isinstance(val, str) and val.startswith("http")
+
+
+# Copied from transformers.models.idefics2.processing_idefics2.is_image_or_image_url
+def is_image_or_image_url(elem):
+ return is_url(elem) or is_valid_image(elem)
+
+
+# Copied from transformers.models.pixtral.image_processing_pixtral.BatchMixFeature
+class BatchMixFeature(BatchFeature):
+ def to(self, *args, **kwargs) -> "BatchMixFeature":
+ """
+ Send all values to device by calling `v.to(*args, **kwargs)` (PyTorch only). This should support casting in
+ different `dtypes` and sending the `BatchFeature` to a different `device`.
+
+ Args:
+ args (`Tuple`):
+ Will be passed to the `to(...)` function of the tensors.
+ kwargs (`Dict`, *optional*):
+ Will be passed to the `to(...)` function of the tensors.
+
+ Returns:
+ [`BatchFeature`]: The same instance after modification.
+ """
+ requires_backends(self, ["torch"])
+ import torch # noqa
+
+ new_data = {}
+ device = kwargs.get("device")
+ # Check if the args are a device or a dtype
+ if device is None and len(args) > 0:
+ # device should be always the first argument
+ arg = args[0]
+ if is_torch_dtype(arg):
+ # The first argument is a dtype
+ pass
+ elif isinstance(arg, str) or is_torch_device(arg) or isinstance(arg, int):
+ device = arg
+ else:
+ # it's something else
+ raise ValueError(f"Attempting to cast a BatchFeature to type {str(arg)}. This is not supported.")
+ # We cast only floating point tensors to avoid issues with tokenizers casting `LongTensor` to `FloatTensor`
+ for k, v in self.items():
+ # check if v is a floating point
+ if isinstance(v, list):
+ new_data[k] = [
+ element.to(*args, **kwargs) for sample in v for element in sample if is_torch_tensor(element)
+ ]
+ elif torch.is_floating_point(v):
+ # cast and send to device
+ new_data[k] = v.to(*args, **kwargs)
+ elif device is not None:
+ new_data[k] = v.to(device=device)
+ else:
+ new_data[k] = v
+ self.data = new_data
+ return self
+
+
+class PixtralProcessor(ProcessorMixin):
+    r"""
+    Constructs a Pixtral processor which wraps a Pixtral image processor and a Pixtral tokenizer into a single processor.
+
+    [`PixtralProcessor`] offers all the functionalities of [`PixtralImageProcessor`] and [`LlamaTokenizerFast`]. See the
+    [`~PixtralProcessor.__call__`] and [`~PixtralProcessor.decode`] for more information.
+
+    Args:
+        image_processor ([`PixtralImageProcessor`], *optional*):
+            The image processor is a required input.
+        tokenizer ([`LlamaTokenizerFast`], *optional*):
+            The tokenizer is a required input.
+        patch_size (`int`, *optional*, defaults to 16):
+            Patch size from the vision tower.
+        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
+            in a chat into a tokenizable string.
+        image_token (`str`, *optional*, defaults to `"[IMG]"`):
+            Special token used to denote image location.
+        image_break_token (`str`, *optional*, defaults to `"[IMG_BREAK]"`):
+            Special token used to denote the end of a line of pixels in an image.
+        image_end_token (`str`, *optional*, defaults to `"[IMG_END]"`):
+            Special token used to denote the end of an image input.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    valid_kwargs = [
+        "chat_template",
+        "patch_size",
+        "image_token",
+        "image_break_token",
+        "image_end_token",
+    ]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(
+        self,
+        image_processor=None,
+        tokenizer=None,
+        patch_size: int = 16,
+        chat_template=None,
+        image_token="[IMG]",  # set the default and let users change if they have peculiar special tokens in rare cases
+        image_break_token="[IMG_BREAK]",
+        image_end_token="[IMG_END]",
+        **kwargs,
+    ):
+        self.patch_size = patch_size
+        self.image_token = image_token
+        self.image_break_token = image_break_token
+        self.image_end_token = image_end_token
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+
+    def __call__(
+        self,
+        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+        images: ImageInput = None,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length=None,
+        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
+    ) -> BatchMixFeature:
+        """
+        Main method to prepare one or several sequence(s) and image(s) for the model. The `images`, if any, are passed
+        to the image processor, every `image_token` in `text` is then expanded into the full grid of image tokens of
+        the matching image, and the expanded prompts are passed to the tokenizer together with `padding`,
+        `truncation`, `max_length` and `return_tensors`. Please refer to the docstring of
+        [`~PixtralImageProcessor.__call__`] and [`~LlamaTokenizerFast.__call__`] for more information.
+
+        Args:
+            text (`str`, `List[str]`):
+                The prompt or batch of prompts to be encoded. Each prompt must be a string; pretokenized input is
+                not supported because the image tokens are expanded with string replacement. A `ValueError` is
+                raised for anything else, including `None`.
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
+                Select a strategy to pad the returned sequences (according to the model's padding side and padding
+                index) among:
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            max_length (`int`, *optional*):
+                Maximum length of the returned list and optionally padding length (see above).
+            truncation (`bool`, *optional*):
+                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `TensorType.PYTORCH`):
+                If set, will return tensors of a particular framework. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            [`BatchMixFeature`]: A [`BatchMixFeature`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+        """
+        if images is not None:
+            if is_image_or_image_url(images):
+                images = [[images]]
+            elif isinstance(images, list) and images and is_image_or_image_url(images[0]):
+                images = [images]
+            elif not (
+                isinstance(images, list)
+                and len(images) > 0
+                and all(isinstance(sample, list) for sample in images)
+            ):
+                raise ValueError(
+                    "Invalid input images. Please provide a single image or a list of images or a list of list of images."
+                )
+            images = [[load_image(im) for im in sample] for sample in images]
+            image_inputs = self.image_processor(images, patch_size=self.patch_size, return_tensors=return_tensors)
+        else:
+            image_inputs = {}
+
+        if isinstance(text, str):
+            text = [text]
+        elif not isinstance(text, list) or not all(isinstance(prompt, str) for prompt in text):
+            raise ValueError("Invalid input text. Please provide a string, or a list of strings")
+
+        # try to expand inputs in processing if we have the necessary parts
+        prompt_strings = text
+        if image_inputs.get("pixel_values") is not None:
+            # Replace the image token with the expanded image token sequence
+            images = image_inputs["pixel_values"]
+            image_sizes = image_inputs.pop("image_sizes")
+            prompt_strings = []
+
+            for sample_images, sample_image_sizes, sample in zip(images, image_sizes, text):
+                replace_strings = []
+                # First calculate the number of tokens needed for each image and put in a placeholder
+                for image, image_size in zip(sample_images, sample_image_sizes):
+                    height, width = image_size
+                    num_height_tokens = height // self.patch_size
+                    num_width_tokens = width // self.patch_size
+                    replace_tokens = [
+                        [self.image_token] * num_width_tokens + [self.image_break_token]
+                    ] * num_height_tokens
+                    # Flatten list
+                    replace_tokens = [item for sublist in replace_tokens for item in sublist]
+                    replace_tokens[-1] = self.image_end_token
+                    replace_str = "".join(replace_tokens)
+                    replace_strings.append(replace_str)
+                    sample = sample.replace(self.image_token, "<placeholder>", 1)
+
+                # Two passes are needed: the expansion contains `image_token` and would otherwise be matched again
+                for replace_str in replace_strings:
+                    sample = sample.replace("<placeholder>", replace_str, 1)
+
+                prompt_strings.append(sample)
+
+        text_inputs = self.tokenizer(
+            prompt_strings,
+            return_tensors=return_tensors,
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+        )
+        return BatchMixFeature(data={**text_inputs, **image_inputs})
+
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @property
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index b9ce0d0f15bbf5..2db7b38b580375 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -7067,6 +7067,20 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
+class PixtralModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class PixtralPreTrainedModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
class PLBartForCausalLM(metaclass=DummyObject):
_backends = ["torch"]
diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py
index 2493954a518b2c..436378582e54ca 100644
--- a/src/transformers/utils/dummy_vision_objects.py
+++ b/src/transformers/utils/dummy_vision_objects.py
@@ -506,6 +506,13 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])
+class PixtralImageProcessor(metaclass=DummyObject):
+ _backends = ["vision"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["vision"])
+
+
class PoolFormerFeatureExtractor(metaclass=DummyObject):
_backends = ["vision"]
diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py
index 2fed802b5a2fb3..5c05480ffa6dbb 100644
--- a/tests/models/llava/test_modeling_llava.py
+++ b/tests/models/llava/test_modeling_llava.py
@@ -569,3 +569,50 @@ def test_expansion_in_processing(self):
# check that both inputs are handled correctly and generate the same output
self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist())
+
+    @slow
+    @require_bitsandbytes
+    def test_pixtral(self):
+        """Generate a description of four images with the Pixtral checkpoint and compare it with a reference text."""
+        model_id = "hf-internal-testing/pixtral-12b"
+        model = LlavaForConditionalGeneration.from_pretrained(model_id)
+        processor = AutoProcessor.from_pretrained(model_id)
+
+        IMG_URLS = [
+            Image.open(requests.get("https://picsum.photos/id/237/400/300", stream=True).raw),
+            Image.open(requests.get("https://picsum.photos/id/231/200/300", stream=True).raw),
+            Image.open(requests.get("https://picsum.photos/id/27/500/500", stream=True).raw),
+            Image.open(requests.get("https://picsum.photos/id/17/150/600", stream=True).raw),
+        ]
+        PROMPT = "[INST]Describe the images.\n[IMG][IMG][IMG][IMG][/INST]"
+
+        inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to("cuda")
+        generate_ids = model.generate(**inputs, max_new_tokens=500)
+        output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+
+        # fmt: off
+        EXPECTED_GENERATION = """
+Describe the images.
+Sure, let's break down each image description:
+
+1. **Image 1:**
+ - **Description:** A black dog with a glossy coat is sitting on a wooden floor. The dog has a focused expression and is looking directly at the camera.
+ - **Details:** The wooden floor has a rustic appearance with visible wood grain patterns. The dog's eyes are a striking color, possibly brown or amber, which contrasts with its black fur.
+
+2. **Image 2:**
+ - **Description:** A scenic view of a mountainous landscape with a winding road cutting through it. The road is surrounded by lush green vegetation and leads to a distant valley.
+ - **Details:** The mountains are rugged with steep slopes, and the sky is clear, indicating good weather. The winding road adds a sense of depth and perspective to the image.
+
+3. **Image 3:**
+ - **Description:** A beach scene with waves crashing against the shore. There are several people in the water and on the beach, enjoying the waves and the sunset.
+ - **Details:** The waves are powerful, creating a dynamic and lively atmosphere. The sky is painted with hues of orange and pink from the setting sun, adding a warm glow to the scene.
+
+4. **Image 4:**
+ - **Description:** A garden path leading to a large tree with a bench underneath it. The path is bordered by well-maintained grass and flowers.
+ - **Details:** The path is made of small stones or gravel, and the tree provides a shaded area with the bench invitingly placed beneath it. The surrounding area is lush and green, suggesting a well-kept garden.
+
+Each image captures a different scene, from a close-up of a dog to expansive natural landscapes, showcasing various elements of nature and human interaction with it.
+"""
+        # fmt: on
+        # `output` is one decoded string, so use `assertEqual`; `assertListEqual` fails on anything that is not a list
+        self.assertEqual(output, EXPECTED_GENERATION)
diff --git a/tests/models/pixtral/__init__.py b/tests/models/pixtral/__init__.py
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/tests/models/pixtral/test_image_processing_pixtral.py b/tests/models/pixtral/test_image_processing_pixtral.py
new file mode 100644
index 00000000000000..3994201c065c45
--- /dev/null
+++ b/tests/models/pixtral/test_image_processing_pixtral.py
@@ -0,0 +1,217 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+import unittest
+
+import numpy as np
+
+from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_torch_available, is_vision_available
+
+from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
+
+
+if is_torch_available():
+ import torch
+
+if is_vision_available():
+ from PIL import Image
+
+ from transformers import PixtralImageProcessor
+
+
+class PixtralImageProcessingTester(unittest.TestCase):
+ def __init__(
+ self,
+ parent,
+ batch_size=7,
+ num_channels=3,
+ image_size=18,
+ max_num_images_per_sample=3,
+ min_resolution=30,
+ max_resolution=400,
+ do_resize=True,
+ size=None,
+ patch_size=None,
+ do_normalize=True,
+ image_mean=[0.48145466, 0.4578275, 0.40821073],
+ image_std=[0.26862954, 0.26130258, 0.27577711],
+ do_convert_rgb=True,
+ ):
+ size = size if size is not None else {"longest_edge": 24}
+ patch_size = patch_size if patch_size is not None else {"height": 8, "width": 8}
+ self.parent = parent
+ self.batch_size = batch_size
+ self.num_channels = num_channels
+ self.image_size = image_size
+ self.max_num_images_per_sample = max_num_images_per_sample
+ self.min_resolution = min_resolution
+ self.max_resolution = max_resolution
+ self.do_resize = do_resize
+ self.size = size
+ self.patch_size = patch_size
+ self.do_normalize = do_normalize
+ self.image_mean = image_mean
+ self.image_std = image_std
+ self.do_convert_rgb = do_convert_rgb
+
+ def prepare_image_processor_dict(self):
+ return {
+ "do_resize": self.do_resize,
+ "size": self.size,
+ "patch_size": self.patch_size,
+ "do_normalize": self.do_normalize,
+ "image_mean": self.image_mean,
+ "image_std": self.image_std,
+ "do_convert_rgb": self.do_convert_rgb,
+ }
+
+ def expected_output_image_shape(self, image):
+ if isinstance(image, Image.Image):
+ width, height = image.size
+ elif isinstance(image, np.ndarray):
+ height, width = image.shape[:2]
+ elif isinstance(image, torch.Tensor):
+ height, width = image.shape[-2:]
+
+ max_height = max_width = self.size.get("longest_edge")
+
+ ratio = max(height / max_height, width / max_width)
+ if ratio > 1:
+ height = int(np.ceil(height / ratio))
+ width = int(np.ceil(width / ratio))
+
+ patch_height, patch_width = self.patch_size["height"], self.patch_size["width"]
+ num_height_tokens = (height - 1) // patch_height + 1
+ num_width_tokens = (width - 1) // patch_width + 1
+
+ height = num_height_tokens * patch_height
+ width = num_width_tokens * patch_width
+
+ return self.num_channels, height, width
+
+ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
+ # Use prepare_image_inputs to make a list of list of single images
+
+ images_list = []
+ for _ in range(self.batch_size):
+ images = []
+ for _ in range(random.randint(1, self.max_num_images_per_sample)):
+ img = prepare_image_inputs(
+ batch_size=1,
+ num_channels=self.num_channels,
+ min_resolution=self.min_resolution,
+ max_resolution=self.max_resolution,
+ equal_resolution=equal_resolution,
+ numpify=numpify,
+ torchify=torchify,
+ )[0]
+ images.append(img)
+ images_list.append(images)
+ return images_list
+
+
+@require_torch
+@require_vision
+class PixtralImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
+ image_processing_class = PixtralImageProcessor if is_vision_available() else None
+
+ def setUp(self):
+ super().setUp()
+ self.image_processor_tester = PixtralImageProcessingTester(self)
+
+ @property
+ def image_processor_dict(self):
+ return self.image_processor_tester.prepare_image_processor_dict()
+
+ def test_image_processor_properties(self):
+ image_processing = self.image_processing_class(**self.image_processor_dict)
+ self.assertTrue(hasattr(image_processing, "do_resize"))
+ self.assertTrue(hasattr(image_processing, "size"))
+ self.assertTrue(hasattr(image_processing, "patch_size"))
+ self.assertTrue(hasattr(image_processing, "do_rescale"))
+ self.assertTrue(hasattr(image_processing, "rescale_factor"))
+ self.assertTrue(hasattr(image_processing, "do_normalize"))
+ self.assertTrue(hasattr(image_processing, "image_mean"))
+ self.assertTrue(hasattr(image_processing, "image_std"))
+ self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
+
+ def test_call_pil(self):
+ # Initialize image_processing
+ image_processing = self.image_processing_class(**self.image_processor_dict)
+ # create random PIL images
+ image_inputs_list = self.image_processor_tester.prepare_image_inputs()
+ for image_inputs in image_inputs_list:
+ for image in image_inputs:
+ self.assertIsInstance(image, Image.Image)
+
+ # Test not batched input
+ encoded_images = image_processing(image_inputs_list[0][0], return_tensors="pt").pixel_values
+ expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs_list[0][0])
+ self.assertEqual(tuple(encoded_images[0][0].shape), expected_output_image_shape)
+
+ # Test batched
+ batch_encoded_images = image_processing(image_inputs_list, return_tensors="pt").pixel_values
+ for encoded_images, images in zip(batch_encoded_images, image_inputs_list):
+ for encoded_image, image in zip(encoded_images, images):
+ expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image)
+ self.assertEqual(tuple(encoded_image.shape), expected_output_image_shape)
+
+ def test_call_numpy(self):
+ # Initialize image_processing
+ image_processing = self.image_processing_class(**self.image_processor_dict)
+ # create random numpy tensors
+ image_inputs_list = self.image_processor_tester.prepare_image_inputs(numpify=True)
+ for image_inputs in image_inputs_list:
+ for image in image_inputs:
+ self.assertIsInstance(image, np.ndarray)
+
+ # Test not batched input
+ encoded_images = image_processing(image_inputs_list[0][0], return_tensors="pt").pixel_values
+ expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs_list[0][0])
+ self.assertEqual(tuple(encoded_images[0][0].shape), expected_output_image_shape)
+
+ # Test batched
+ batch_encoded_images = image_processing(image_inputs_list, return_tensors="pt").pixel_values
+ for encoded_images, images in zip(batch_encoded_images, image_inputs_list):
+ for encoded_image, image in zip(encoded_images, images):
+ expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image)
+ self.assertEqual(tuple(encoded_image.shape), expected_output_image_shape)
+
+ def test_call_pytorch(self):
+ # Initialize image_processing
+ image_processing = self.image_processing_class(**self.image_processor_dict)
+ # create random PyTorch tensors
+ image_inputs_list = self.image_processor_tester.prepare_image_inputs(torchify=True)
+ for image_inputs in image_inputs_list:
+ for image in image_inputs:
+ self.assertIsInstance(image, torch.Tensor)
+
+ # Test not batched input
+ encoded_images = image_processing(image_inputs_list[0][0], return_tensors="pt").pixel_values
+ expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs_list[0][0])
+ self.assertEqual(tuple(encoded_images[0][0].shape), expected_output_image_shape)
+
+ # Test batched
+ batch_encoded_images = image_processing(image_inputs_list, return_tensors="pt").pixel_values
+ for encoded_images, images in zip(batch_encoded_images, image_inputs_list):
+ for encoded_image, image in zip(encoded_images, images):
+ expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image)
+ self.assertEqual(tuple(encoded_image.shape), expected_output_image_shape)
+
+ @unittest.skip(reason="PixtralImageProcessor doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy
+ def test_call_numpy_4_channels(self):
+ pass
diff --git a/tests/models/pixtral/test_modeling_pixtral.py b/tests/models/pixtral/test_modeling_pixtral.py
new file mode 100644
index 00000000000000..bd41fa1c9e62fb
--- /dev/null
+++ b/tests/models/pixtral/test_modeling_pixtral.py
@@ -0,0 +1,292 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Testing suite for the PyTorch Pixtral model."""
+
+import gc
+import unittest
+
+import requests
+
+from transformers import (
+ AutoProcessor,
+ PixtralModel,
+ PixtralVisionConfig,
+ is_torch_available,
+ is_vision_available,
+)
+from transformers.testing_utils import (
+ require_bitsandbytes,
+ require_torch,
+ slow,
+ torch_device,
+)
+
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import ModelTesterMixin, floats_tensor
+
+
+if is_torch_available():
+ import torch
+else:
+ is_torch_greater_or_equal_than_2_0 = False
+
+if is_vision_available():
+ from PIL import Image
+
+
+class PixtralModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=12,
+        image_size=30,
+        patch_size=2,
+        num_channels=3,
+        is_training=True,
+        hidden_size=32,
+        projection_dim=32,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        intermediate_size=37,
+        dropout=0.1,
+        attention_dropout=0.1,
+        initializer_range=0.02,
+        scope=None,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.is_training = is_training
+        self.hidden_size = hidden_size
+        self.projection_dim = projection_dim
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.initializer_range = initializer_range
+        self.scope = scope
+
+        # NOTE(review): the `+ 1` follows the ViT convention of a [CLS] token - confirm that Pixtral adds one
+        num_patches = (image_size // patch_size) ** 2
+        self.seq_length = num_patches + 1
+
+    def prepare_config_and_inputs(self):
+        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
+        config = self.get_config()
+
+        return config, pixel_values
+
+    def get_config(self):
+        return PixtralVisionConfig(
+            image_size=self.image_size,
+            patch_size=self.patch_size,
+            num_channels=self.num_channels,
+            hidden_size=self.hidden_size,
+            projection_dim=self.projection_dim,
+            num_hidden_layers=self.num_hidden_layers,
+            num_attention_heads=self.num_attention_heads,
+            intermediate_size=self.intermediate_size,
+            dropout=self.dropout,
+            attention_dropout=self.attention_dropout,
+            initializer_range=self.initializer_range,
+        )
+
+    def create_and_check_model(self, config, pixel_values):
+        model = PixtralModel(config=config)
+        model.to(torch_device)
+        model.eval()
+        with torch.no_grad():
+            result = model(pixel_values)
+        # expected sequence length = num_patches + 1 (NOTE(review): the [CLS] token is a ViT assumption - confirm)
+        image_size = (self.image_size, self.image_size)
+        patch_size = (self.patch_size, self.patch_size)
+        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size))
+        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
+
+    def create_and_check_model_with_projection(self, config, pixel_values):
+        model = PixtralModel(config=config)
+        model.to(torch_device)
+        model.eval()
+        with torch.no_grad():
+            result = model(pixel_values)
+        # expected sequence length = num_patches + 1 (NOTE(review): the [CLS] token is a ViT assumption - confirm)
+        image_size = (self.image_size, self.image_size)
+        patch_size = (self.patch_size, self.patch_size)
+        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size))
+        self.parent.assertEqual(result.image_embeds.shape, (self.batch_size, self.projection_dim))
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, pixel_values = config_and_inputs
+        inputs_dict = {"pixel_values": pixel_values}
+        return config, inputs_dict
+
+
+@require_torch
+class PixtralModelModelTest(ModelTesterMixin, unittest.TestCase):
+ """
+ Model tester for `PixtralModel`.
+ """
+
+ all_model_classes = (PixtralModel,) if is_torch_available() else ()
+ test_pruning = False
+ test_head_masking = False
+
+ def setUp(self):
+ self.model_tester = PixtralModelTester(self)
+ self.config_tester = ConfigTester(self, config_class=PixtralVisionConfig, has_text_modality=False)
+
+ @unittest.skip("model does not support input embeds")
+ def test_inputs_embeds(self):
+ pass
+
+ @unittest.skip("model does not support input embeds")
+ def test_inputs_embeds_matches_input_ids(self):
+ pass
+
+ @unittest.skip(
+ reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
+ )
+ def test_training_gradient_checkpointing(self):
+ pass
+
+ @unittest.skip(
+ reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
+ )
+ def test_training_gradient_checkpointing_use_reentrant(self):
+ pass
+
+ @unittest.skip(
+ reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
+ )
+ def test_training_gradient_checkpointing_use_reentrant_false(self):
+ pass
+
+ @unittest.skip(reason="Compile not yet supported because in Pixtral models")
+ def test_sdpa_can_compile_dynamic(self):
+ pass
+
+ @unittest.skip(reason="Compile not yet supported because in Pixtral models")
+ def test_sdpa_can_dispatch_on_flash(self):
+ pass
+
+ @unittest.skip(reason="Not supported yet")
+ def test_attention_outputs(self):
+ pass
+
+ @unittest.skip(reason="Not supported yet")
+ def test_cpu_offload(self):
+ pass
+
+ @unittest.skip(reason="Not supported yet")
+ def test_batching_equivalence(self):
+ pass
+
+ @unittest.skip(reason="Not supported yet")
+ def test_disk_offload_bin(self):
+ pass
+
+ @unittest.skip(reason="Not supported yet")
+ def test_retain_grad_hidden_states_attentions(self):
+ pass
+
+ @unittest.skip(reason="Not supported yet")
+ def test_multi_gpu_data_parallel_forward(self):
+ pass
+
+ @unittest.skip(reason="Not supported yet")
+ def test_model_parallelism(self):
+ pass
+
+ @unittest.skip(reason="Not supported yet")
+ def test_model_outputs_equivalence(self):
+ pass
+
+ @unittest.skip(reason="Not supported yet")
+ def test_save_load(self):
+ pass
+
+ @unittest.skip(reason="Not supported yet")
+ def test_model_get_set_embeddings(self):
+ pass
+
+ @unittest.skip(reason="Not supported yet")
+ def test_resize_tokens_embeddings(self):
+ pass
+
+ @unittest.skip(reason="Not supported yet")
+ def test_model_main_input_name(self):
+ pass
+
+ @unittest.skip(reason="Not supported yet")
+ def test_initialization(self):
+ pass
+
+ @unittest.skip(reason="Not supported yet")
+ def test_hidden_states_output(self):
+ pass
+
+ @unittest.skip(reason="Not supported yet")
+ def test_gradient_checkpointing_backward_compatibility(self):
+ pass
+
+ @unittest.skip(reason="Not supported yet")
+ def test_feed_forward_chunking(self):
+ pass
+
+ @unittest.skip(reason="Not supported yet")
+ def test_disk_offload_safetensors(self):
+ pass
+
+ @unittest.skip(reason="Not supported yet")
+ def test_determinism(self):
+ pass
+
+
+@require_torch
+class PixtralModelIntegrationTest(unittest.TestCase):
+    def setUp(self):
+        self.processor = AutoProcessor.from_pretrained("hf-internal-testing/pixtral-12b")
+
+    def tearDown(self):
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    @slow
+    @require_bitsandbytes
+    def test_small_model_integration_test(self):
+        # NOTE(review): the expected text is USER/ASSISTANT style but the prompt is [INST] style - confirm references
+        model = PixtralModel.from_pretrained("hf-internal-testing/pixtral-12b", load_in_4bit=True)
+
+        prompt = "[INST][IMG]\nWhat are the things I should be cautious about when I visit this place?[/INST]"
+        image_file = "https://pixtral-vl.github.io/static/images/view.jpg"
+        raw_image = Image.open(requests.get(image_file, stream=True).raw)
+        inputs = self.processor(prompt, raw_image, return_tensors="pt")
+
+        EXPECTED_INPUT_IDS = torch.tensor([[1, 32000, 28705, 13, 11123, 28747, 1824, 460, 272, 1722,315, 1023, 347, 13831, 925, 684, 739, 315, 3251, 456,1633, 28804, 13, 4816, 8048, 12738, 28747]])  # fmt: skip
+        self.assertTrue(torch.equal(inputs["input_ids"], EXPECTED_INPUT_IDS))
+
+        output = model.generate(**inputs, max_new_tokens=20)
+        EXPECTED_DECODED_TEXT = "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT: When visiting this place, there are a few things one should be cautious about. Firstly,"  # fmt: skip
+
+        self.assertEqual(
+            self.processor.decode(output[0], skip_special_tokens=True),
+            EXPECTED_DECODED_TEXT,
+        )
diff --git a/tests/models/pixtral/test_processor_pixtral.py b/tests/models/pixtral/test_processor_pixtral.py
new file mode 100644
index 00000000000000..b70cab1c074480
--- /dev/null
+++ b/tests/models/pixtral/test_processor_pixtral.py
@@ -0,0 +1,233 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+import requests
+import torch
+
+from transformers.testing_utils import require_vision
+from transformers.utils import is_vision_available
+
+
+if is_vision_available():
+ from PIL import Image
+
+ from transformers import AutoTokenizer, PixtralImageProcessor, PixtralProcessor
+
+
+@require_vision
+class PixtralProcessorTest(unittest.TestCase):
+ processor_class = PixtralProcessor
+
+ @classmethod
+ def setUpClass(cls):
+ cls.url_0 = "https://www.ilankelman.org/stopsigns/australia.jpg"
+ cls.image_0 = Image.open(requests.get(cls.url_0, stream=True).raw)
+ cls.url_1 = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ cls.image_1 = Image.open(requests.get(cls.url_1, stream=True).raw)
+ cls.url_2 = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
+ cls.image_2 = Image.open(requests.get(cls.url_2, stream=True).raw)
+
+ def setUp(self):
+ super().setUp()
+
+ # FIXME - just load the processor directly from the checkpoint
+ tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/pixtral-12b")
+ image_processor = PixtralImageProcessor()
+ self.processor = PixtralProcessor(tokenizer=tokenizer, image_processor=image_processor)
+
+ @unittest.skip("No chat template was set for this model (yet)")
+ def test_chat_template(self):
+ expected_prompt = "USER: [IMG]\nWhat is shown in this image? ASSISTANT:"
+
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {"type": "image"},
+ {"type": "text", "text": "What is shown in this image?"},
+ ],
+ },
+ ]
+ formatted_prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True)
+ self.assertEqual(expected_prompt, formatted_prompt)
+
+ @unittest.skip("No chat template was set for this model (yet)")
+ def test_image_token_filling(self):
+ # Important to check with non square image
+ image = torch.randint(0, 2, (3, 500, 316))
+ expected_image_tokens = 1526
+ image_token_index = 32000
+
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {"type": "image"},
+ {"type": "text", "text": "What is shown in this image?"},
+ ],
+ },
+ ]
+ inputs = self.processor(
+ text=[self.processor.apply_chat_template(messages)],
+ images=[image],
+ return_tensors="pt",
+ )
+ image_tokens = (inputs["input_ids"] == image_token_index).sum().item()
+ self.assertEqual(expected_image_tokens, image_tokens)
+
+ def test_processor_with_single_image(self):
+ prompt_string = "USER: [IMG]\nWhat's the content of the image? ASSISTANT:"
+
+ # Make small for checking image token expansion
+ self.processor.image_processor.size = {"longest_edge": 30}
+ self.processor.image_processor.patch_size = {"height": 2, "width": 2}
+
+ # Test passing in an image
+ inputs_image = self.processor(text=prompt_string, images=self.image_0, return_tensors="pt")
+ self.assertIn("input_ids", inputs_image)
+ self.assertTrue(len(inputs_image["input_ids"]) == 1)
+ self.assertIsInstance(inputs_image["input_ids"], torch.Tensor)
+ self.assertIsInstance(inputs_image["pixel_values"], list)
+ self.assertTrue(len(inputs_image["pixel_values"]) == 1)
+ self.assertIsInstance(inputs_image["pixel_values"][0], list)
+ self.assertTrue(len(inputs_image["pixel_values"][0]) == 1)
+ self.assertIsInstance(inputs_image["pixel_values"][0][0], torch.Tensor)
+
+ # fmt: off
+ input_ids = inputs_image["input_ids"]
+ self.assertEqual(
+ input_ids[0].tolist(),
+ # Equivalent to "USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the content of the image? ASSISTANT:"
+ [21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 4701, 1307, 1278, 3937, 1063, 1349, 4290, 16002, 41150, 1058]
+ )
+ # fmt: on
+
+ # Test passing in a url
+ inputs_url = self.processor(text=prompt_string, images=self.url_0, return_tensors="pt")
+ self.assertIn("input_ids", inputs_url)
+ self.assertTrue(len(inputs_url["input_ids"]) == 1)
+ self.assertIsInstance(inputs_url["input_ids"], torch.Tensor)
+ self.assertIsInstance(inputs_url["pixel_values"], list)
+ self.assertTrue(len(inputs_url["pixel_values"]) == 1)
+ self.assertIsInstance(inputs_url["pixel_values"][0], list)
+ self.assertTrue(len(inputs_url["pixel_values"][0]) == 1)
+ self.assertIsInstance(inputs_url["pixel_values"][0][0], torch.Tensor)
+
+ # fmt: off
+ input_ids = inputs_url["input_ids"]
+ self.assertEqual(
+ input_ids[0].tolist(),
+ # Equivalent to "USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the content of the image? ASSISTANT:"
+ [21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 4701, 1307, 1278, 3937, 1063, 1349, 4290, 16002, 41150, 1058]
+ )
+ # fmt: on
+
+ def test_processor_with_multiple_images_single_list(self):
+ prompt_string = "USER: [IMG][IMG]\nWhat's the difference between these two images? ASSISTANT:"
+
+ # Make small for checking image token expansion
+ self.processor.image_processor.size = {"longest_edge": 30}
+ self.processor.image_processor.patch_size = {"height": 2, "width": 2}
+
+ # Test passing in an image
+ inputs_image = self.processor(text=prompt_string, images=[self.image_0, self.image_1], return_tensors="pt")
+ self.assertIn("input_ids", inputs_image)
+ self.assertTrue(len(inputs_image["input_ids"]) == 1)
+ self.assertIsInstance(inputs_image["input_ids"], torch.Tensor)
+ self.assertIsInstance(inputs_image["pixel_values"], list)
+ self.assertTrue(len(inputs_image["pixel_values"]) == 1)
+ self.assertIsInstance(inputs_image["pixel_values"][0], list)
+ self.assertTrue(len(inputs_image["pixel_values"][0]) == 2)
+ self.assertIsInstance(inputs_image["pixel_values"][0][0], torch.Tensor)
+
+ # fmt: off
+ input_ids = inputs_image["input_ids"]
+ self.assertEqual(
+ input_ids[0].tolist(),
+ # Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"]
+ [21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
+ )
+ # fmt: on
+
+ # Test passing in a url
+ inputs_url = self.processor(text=prompt_string, images=[self.url_0, self.url_1], return_tensors="pt")
+ self.assertIn("input_ids", inputs_url)
+ self.assertTrue(len(inputs_url["input_ids"]) == 1)
+ self.assertIsInstance(inputs_url["input_ids"], torch.Tensor)
+ self.assertIsInstance(inputs_url["pixel_values"], list)
+ self.assertTrue(len(inputs_url["pixel_values"]) == 1)
+ self.assertIsInstance(inputs_url["pixel_values"][0], list)
+ self.assertTrue(len(inputs_url["pixel_values"][0]) == 2)
+ self.assertIsInstance(inputs_url["pixel_values"][0][0], torch.Tensor)
+ # fmt: off
+ input_ids = inputs_url["input_ids"]
+ self.assertEqual(
+ input_ids[0].tolist(),
+ # Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"]
+ [21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
+ )
+ # fmt: on
+
+ def test_processor_with_multiple_images_multiple_lists(self):
+ prompt_string = [
+ "USER: [IMG][IMG]\nWhat's the difference between these two images? ASSISTANT:",
+ "USER: [IMG]\nWhat's the content of the image? ASSISTANT:",
+ ]
+ self.processor.tokenizer.pad_token = ""
+ image_inputs = [[self.image_0, self.image_1], [self.image_2]]
+
+ # Make small for checking image token expansion
+ self.processor.image_processor.size = {"longest_edge": 30}
+ self.processor.image_processor.patch_size = {"height": 2, "width": 2}
+
+ # Test passing in an image
+ inputs_image = self.processor(text=prompt_string, images=image_inputs, return_tensors="pt", padding=True)
+ self.assertIn("input_ids", inputs_image)
+ self.assertTrue(len(inputs_image["input_ids"]) == 2)
+ self.assertIsInstance(inputs_image["input_ids"], torch.Tensor)
+ self.assertIsInstance(inputs_image["pixel_values"], list)
+ self.assertTrue(len(inputs_image["pixel_values"]) == 2)
+ self.assertIsInstance(inputs_image["pixel_values"][0], list)
+ self.assertTrue(len(inputs_image["pixel_values"][0]) == 2)
+ self.assertIsInstance(inputs_image["pixel_values"][0][0], torch.Tensor)
+
+ # fmt: off
+ input_ids = inputs_image["input_ids"]
+ self.assertEqual(
+ input_ids[0].tolist(),
+ # Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"]
+ [21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
+ )
+ # fmt: on
+
+ # Test passing the same inputs a second time (NOTE: these are the PIL images again, not urls)
+ inputs_url = self.processor(text=prompt_string, images=image_inputs, return_tensors="pt", padding=True)
+ self.assertIn("input_ids", inputs_url)
+ self.assertTrue(len(inputs_url["input_ids"]) == 2)
+ self.assertIsInstance(inputs_url["input_ids"], torch.Tensor)
+ self.assertIsInstance(inputs_url["pixel_values"], list)
+ self.assertTrue(len(inputs_url["pixel_values"]) == 2)
+ self.assertIsInstance(inputs_url["pixel_values"][0], list)
+ self.assertTrue(len(inputs_url["pixel_values"][0]) == 2)
+ self.assertIsInstance(inputs_url["pixel_values"][0][0], torch.Tensor)
+
+ # fmt: off
+ input_ids = inputs_url["input_ids"]
+ self.assertEqual(
+ input_ids[0].tolist(),
+ # Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"]
+ [21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
+ )
+ # fmt: on
From 95e816f2bca48de32167ce6243e6770dee23923d Mon Sep 17 00:00:00 2001
From: Joao Gante
Date: Mon, 16 Sep 2024 09:44:57 +0100
Subject: [PATCH 09/67] Cohere: update RoPE structure (#33408)
---
.../models/cohere/configuration_cohere.py | 43 +++++
.../models/cohere/modeling_cohere.py | 170 ++++++++++++++----
src/transformers/models/dbrx/modeling_dbrx.py | 2 +-
.../models/gemma/modeling_gemma.py | 2 +-
.../models/granite/modeling_granite.py | 2 +-
.../models/llama/configuration_llama.py | 2 +-
.../models/llama/modeling_llama.py | 2 +-
.../models/mistral/modeling_mistral.py | 2 +-
.../models/mixtral/modeling_mixtral.py | 2 +-
src/transformers/models/olmo/modeling_olmo.py | 2 +-
.../models/olmoe/modeling_olmoe.py | 2 +-
.../models/persimmon/modeling_persimmon.py | 2 +-
src/transformers/models/phi/modeling_phi.py | 2 +-
src/transformers/models/phi3/modeling_phi3.py | 2 +-
.../models/qwen2/modeling_qwen2.py | 2 +-
.../models/qwen2_moe/modeling_qwen2_moe.py | 2 +-
.../models/stablelm/modeling_stablelm.py | 2 +-
.../models/starcoder2/modeling_starcoder2.py | 2 +-
18 files changed, 190 insertions(+), 55 deletions(-)
diff --git a/src/transformers/models/cohere/configuration_cohere.py b/src/transformers/models/cohere/configuration_cohere.py
index 73973bfad60b93..3c1237e5113789 100644
--- a/src/transformers/models/cohere/configuration_cohere.py
+++ b/src/transformers/models/cohere/configuration_cohere.py
@@ -20,6 +20,7 @@
"""Cohere model configuration"""
from ...configuration_utils import PretrainedConfig
+from ...modeling_rope_utils import rope_config_validation
from ...utils import logging
@@ -79,6 +80,43 @@ class CohereConfig(PretrainedConfig):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
+ rope_scaling (`Dict`, *optional*):
+ Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+ and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+ accordingly.
+ Expected contents:
+ `rope_type` (`str`):
+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+ 'llama3'], with 'default' being the original RoPE implementation.
+ `factor` (`float`, *optional*):
+ Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+ most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+ original maximum pre-trained length.
+ `original_max_position_embeddings` (`int`, *optional*):
+ Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+ pretraining.
+ `attention_factor` (`float`, *optional*):
+ Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+ computation. If unspecified, it defaults to value recommended by the implementation, using the
+ `factor` field to infer the suggested value.
+ `beta_fast` (`float`, *optional*):
+ Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+ ramp function. If unspecified, it defaults to 32.
+ `beta_slow` (`float`, *optional*):
+ Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+ ramp function. If unspecified, it defaults to 1.
+ `short_factor` (`List[float]`, *optional*):
+ Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+ size divided by the number of attention heads divided by 2
+ `long_factor` (`List[float]`, *optional*):
+ Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+ size divided by the number of attention heads divided by 2
+ `low_freq_factor` (`float`, *optional*):
+ Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+ `high_freq_factor` (`float`, *optional*):
+ Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
@@ -121,6 +159,7 @@ def __init__(
eos_token_id=255001,
tie_word_embeddings=True,
rope_theta=10000.0,
+ rope_scaling=None,
attention_bias=False,
attention_dropout=0.0,
use_qk_norm=False,
@@ -144,10 +183,14 @@ def __init__(
self.layer_norm_eps = layer_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
+ self.rope_scaling = rope_scaling
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.use_qk_norm = use_qk_norm
+ # Validate the correctness of rotary position embeddings parameters
+ rope_config_validation(self)
+
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py
index 4010d9ec3a4327..ae84a9ec2d1a43 100644
--- a/src/transformers/models/cohere/modeling_cohere.py
+++ b/src/transformers/models/cohere/modeling_cohere.py
@@ -37,6 +37,7 @@
BaseModelOutputWithPast,
CausalLMOutputWithPast,
)
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import (
@@ -135,35 +136,97 @@ def forward(self, hidden_states):
class CohereRotaryEmbedding(nn.Module):
- def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+ # Note: the forward pass of this RoPE is slightly different from Llama's, resulting in different `sin`/`cos` for
+ # the same parameterization. The differences are highlighted with a comment.
+
+ def __init__(
+ self,
+ dim=None,
+ max_position_embeddings=2048,
+ base=10000,
+ device=None,
+ scaling_factor=1.0,
+ rope_type="default",
+ config: Optional[CohereConfig] = None,
+ ):
super().__init__()
- self.scaling_factor = scaling_factor
- self.dim = dim
- self.max_position_embeddings = max_position_embeddings
- self.base = base
- inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
+ # TODO (joao): remove the `if` below, only used for BC
+ self.rope_kwargs = {}
+ if config is None:
+ logger.warning_once(
+ "`CohereRotaryEmbedding` can now be fully parameterized by passing the model config through the "
+ "`config` argument. All other arguments will be removed in v4.46"
+ )
+ self.rope_kwargs = {
+ "rope_type": rope_type,
+ "factor": scaling_factor,
+ "dim": dim,
+ "base": base,
+ "max_position_embeddings": max_position_embeddings,
+ }
+ self.rope_type = rope_type
+ self.max_seq_len_cached = max_position_embeddings
+ self.original_max_seq_len = max_position_embeddings
+ else:
+ # BC: "rope_type" was originally "type"
+ if config.rope_scaling is not None:
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+ else:
+ self.rope_type = "default"
+ self.max_seq_len_cached = config.max_position_embeddings
+ self.original_max_seq_len = config.max_position_embeddings
+
+ self.config = config
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
self.register_buffer("inv_freq", inv_freq, persistent=False)
+ self.original_inv_freq = self.inv_freq
+
+ def _dynamic_frequency_update(self, position_ids, device):
+ """
+ dynamic RoPE layers should recompute `inv_freq` in the following situations:
+ 1 - growing beyond the cached sequence length (allow scaling)
+ 2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
+ """
+ seq_len = torch.max(position_ids) + 1
+ if seq_len > self.max_seq_len_cached: # growth
+ inv_freq, self.attention_scaling = self.rope_init_fn(
+ self.config, device, seq_len=seq_len, **self.rope_kwargs
+ )
+ self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation
+ self.max_seq_len_cached = seq_len
+
+ if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset
+ self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+ self.max_seq_len_cached = self.original_max_seq_len
@torch.no_grad()
def forward(self, x, position_ids):
- # x: [bs, num_attention_heads, seq_len, head_size]
+ if "dynamic" in self.rope_type:
+ self._dynamic_frequency_update(position_ids, device=x.device)
+
+ # Core RoPE block
inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
position_ids_expanded = position_ids[:, None, :].float()
-
- # Force float32 since bfloat16 loses precision on long contexts
- # See https://github.com/huggingface/transformers/pull/29285
+ # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
device_type = x.device.type
device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
with torch.autocast(device_type=device_type, enabled=False):
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
- emb = torch.repeat_interleave(freqs, 2, dim=-1)
+ emb = torch.repeat_interleave(freqs, 2, dim=-1) # This line differs from Llama's implementation
cos = emb.cos()
sin = emb.sin()
- return cos, sin
+
+ # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
+ cos = cos * self.attention_scaling
+ sin = sin * self.attention_scaling
+
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
def rotate_half(x):
- # Split and rotate
+ # Split and rotate. Note that this function is different from e.g. Llama.
x1 = x[..., ::2]
x2 = x[..., 1::2]
rot_x = torch.stack([-x2, x1], dim=-1).flatten(-2)
@@ -272,17 +335,10 @@ def __init__(self, config: CohereConfig, layer_idx: Optional[int] = None):
self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias)
- self._init_rope()
- # Ignore copy
- def _init_rope(self):
- self.rotary_emb = CohereRotaryEmbedding(
- self.head_dim,
- max_position_embeddings=self.max_position_embeddings,
- base=self.rope_theta,
- )
+ # TODO (joao): remove in v4.46 (RoPE is computed in the model, not in the decoder layers)
+ self.rotary_emb = CohereRotaryEmbedding(config=self.config)
- # Ignore copy
def forward(
self,
hidden_states: torch.Tensor,
@@ -292,6 +348,7 @@ def forward(
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
bsz, q_len, _ = hidden_states.size()
@@ -310,7 +367,16 @@ def forward(
key_states = key_states.transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
- cos, sin = self.rotary_emb(value_states, position_ids)
+ if position_embeddings is None:
+ logger.warning_once(
+ "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+ "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
+ "removed and `position_embeddings` will be mandatory."
+ )
+ cos, sin = self.rotary_emb(value_states, position_ids)
+ else:
+ cos, sin = position_embeddings
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_value is not None:
@@ -350,8 +416,7 @@ def forward(
return attn_output, attn_weights, past_key_value
-# copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Cohere
-# TODO(joao): add me back asap :)
+# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Cohere
class CohereFlashAttention2(CohereAttention):
"""
Cohere flash attention module. This module inherits from `CohereAttention` as the weights of the module stays
@@ -377,6 +442,7 @@ def forward(
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
if isinstance(past_key_value, StaticCache):
@@ -402,7 +468,16 @@ def forward(
key_states = key_states.transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
- cos, sin = self.rotary_emb(value_states, position_ids)
+ if position_embeddings is None:
+ logger.warning_once(
+ "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+ "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
+ "removed and `position_embeddings` will be mandatory."
+ )
+ cos, sin = self.rotary_emb(value_states, position_ids)
+ else:
+ cos, sin = position_embeddings
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_value is not None:
@@ -418,7 +493,6 @@ def forward(
dropout_rate = self.attention_dropout if self.training else 0.0
- # Ignore copy
# In PEFT, usually we cast the layer norms in float32 for training stability reasons
# therefore the input hidden states gets silently casted in float32. Hence, we need
# cast them back in the correct dtype just to be sure everything works as expected.
@@ -465,8 +539,6 @@ def forward(
return attn_output, attn_weights, past_key_value
-# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention Llama->Cohere
-# TODO(joao): add me back asap :)
class CohereSdpaAttention(CohereAttention):
"""
Cohere attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
@@ -474,7 +546,6 @@ class CohereSdpaAttention(CohereAttention):
SDPA API.
"""
- # Ignore copy
def forward(
self,
hidden_states: torch.Tensor,
@@ -484,6 +555,7 @@ def forward(
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
if output_attentions:
# TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
@@ -517,7 +589,16 @@ def forward(
key_states = key_states.transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
- cos, sin = self.rotary_emb(value_states, position_ids)
+ if position_embeddings is None:
+ logger.warning_once(
+ "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+ "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
+ "removed and `position_embeddings` will be mandatory."
+ )
+ cos, sin = self.rotary_emb(value_states, position_ids)
+ else:
+ cos, sin = position_embeddings
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_value is not None:
@@ -587,6 +668,7 @@ def forward(
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
cache_position: Optional[torch.LongTensor] = None,
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
"""
Args:
@@ -601,6 +683,11 @@ def forward(
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
(see `past_key_values`).
past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+ Indices depicting the position of the input sequence tokens in the sequence
+ position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
+ Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
+ with `head_dim` being the embedding dimension of each attention head.
"""
residual = hidden_states
@@ -615,6 +702,7 @@ def forward(
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
+ position_embeddings=position_embeddings,
)
# Fully Connected
@@ -755,8 +843,7 @@ def _init_weights(self, module):
"The bare Cohere Model outputting raw hidden-states without any specific head on top.",
COHERE_START_DOCSTRING,
)
-# copied from transformers.models.llama.modeling_llama.LlamaModel with Llama->Cohere
-# TODO(joao): add me back asap :)
+# Copied from transformers.models.llama.modeling_llama.LlamaModel with Llama->Cohere, LLAMA->COHERE
class CohereModel(CoherePreTrainedModel):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`CohereDecoderLayer`]
@@ -776,6 +863,7 @@ def __init__(self, config: CohereConfig):
[CohereDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
self.norm = CohereLayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps)
+ self.rotary_emb = CohereRotaryEmbedding(config=config)
self.gradient_checkpointing = False
# Initialize weights and apply final processing
@@ -787,14 +875,13 @@ def get_input_embeddings(self):
def set_input_embeddings(self, value):
self.embed_tokens = value
- # Ignore copy
@add_start_docstrings_to_model_forward(COHERE_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
- past_key_values: Optional[List[torch.FloatTensor]] = None,
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
@@ -823,30 +910,33 @@ def forward(
if inputs_embeds is None:
inputs_embeds = self.embed_tokens(input_ids)
- past_seen_tokens = 0
return_legacy_cache = False
if (
use_cache and not isinstance(past_key_values, Cache) and not self.training
): # kept for BC (non `Cache` `past_key_values` inputs)
return_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+ logger.warning_once(
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
+ "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
+ )
if cache_position is None:
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
cache_position = torch.arange(
past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
)
-
if position_ids is None:
position_ids = cache_position.unsqueeze(0)
causal_mask = self._update_causal_mask(
attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
)
-
- # embed positions
hidden_states = inputs_embeds
+ # create position embeddings to be shared across the decoder layers
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
# decoder layers
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
@@ -866,6 +956,7 @@ def forward(
output_attentions,
use_cache,
cache_position,
+ position_embeddings,
)
else:
layer_outputs = decoder_layer(
@@ -876,6 +967,7 @@ def forward(
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
+ position_embeddings=position_embeddings,
)
hidden_states = layer_outputs[0]
diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py
index 8db9f6e8b7d09f..43bac44ba1be20 100644
--- a/src/transformers/models/dbrx/modeling_dbrx.py
+++ b/src/transformers/models/dbrx/modeling_dbrx.py
@@ -1066,7 +1066,7 @@ def forward(
return_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
- "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
)
diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py
index 085751cd9bc039..b14e0a4b3d8ca5 100644
--- a/src/transformers/models/gemma/modeling_gemma.py
+++ b/src/transformers/models/gemma/modeling_gemma.py
@@ -862,7 +862,7 @@ def forward(
return_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
- "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
)
diff --git a/src/transformers/models/granite/modeling_granite.py b/src/transformers/models/granite/modeling_granite.py
index ff10b6e6d875f9..876f5ed2a7c8da 100644
--- a/src/transformers/models/granite/modeling_granite.py
+++ b/src/transformers/models/granite/modeling_granite.py
@@ -839,7 +839,7 @@ def forward(
return_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
- "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
)
diff --git a/src/transformers/models/llama/configuration_llama.py b/src/transformers/models/llama/configuration_llama.py
index 435f0091e06e70..a3667e06534564 100644
--- a/src/transformers/models/llama/configuration_llama.py
+++ b/src/transformers/models/llama/configuration_llama.py
@@ -192,7 +192,7 @@ def __init__(
self.mlp_bias = mlp_bias
self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
# Validate the correctness of rotary position embeddings parameters
- # BC: if there is a 'type' field, move it to 'rope_type'.
+ # BC: if there is a 'type' field, copy it to 'rope_type'.
if self.rope_scaling is not None and "type" in self.rope_scaling:
self.rope_scaling["rope_type"] = self.rope_scaling["type"]
rope_config_validation(self)
diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py
index 9a1d6c0749f932..c7017832b9324c 100644
--- a/src/transformers/models/llama/modeling_llama.py
+++ b/src/transformers/models/llama/modeling_llama.py
@@ -951,7 +951,7 @@ def forward(
return_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
- "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
)
diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py
index c43418182c3881..ffe16b27203301 100644
--- a/src/transformers/models/mistral/modeling_mistral.py
+++ b/src/transformers/models/mistral/modeling_mistral.py
@@ -767,7 +767,7 @@ def forward(
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
return_legacy_cache = True
logger.warning_once(
- "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
)
diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py
index 2e23d06699087e..c7062e75b1085c 100644
--- a/src/transformers/models/mixtral/modeling_mixtral.py
+++ b/src/transformers/models/mixtral/modeling_mixtral.py
@@ -1023,7 +1023,7 @@ def forward(
use_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
- "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
)
diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py
index 007e69570e7821..b4bda8e2db5251 100644
--- a/src/transformers/models/olmo/modeling_olmo.py
+++ b/src/transformers/models/olmo/modeling_olmo.py
@@ -873,7 +873,7 @@ def forward(
return_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
- "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
)
diff --git a/src/transformers/models/olmoe/modeling_olmoe.py b/src/transformers/models/olmoe/modeling_olmoe.py
index a53f1eeda61196..a33338365312db 100644
--- a/src/transformers/models/olmoe/modeling_olmoe.py
+++ b/src/transformers/models/olmoe/modeling_olmoe.py
@@ -1012,7 +1012,7 @@ def forward(
return_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
- "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
)
diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py
index 9fab09bdcc7877..ccaa2c7fd29aae 100644
--- a/src/transformers/models/persimmon/modeling_persimmon.py
+++ b/src/transformers/models/persimmon/modeling_persimmon.py
@@ -690,7 +690,7 @@ def forward(
use_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
- "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
)
diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py
index 0d8be04af20d5c..648d1653a3b503 100644
--- a/src/transformers/models/phi/modeling_phi.py
+++ b/src/transformers/models/phi/modeling_phi.py
@@ -981,7 +981,7 @@ def forward(
use_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
- "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
)
diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py
index 273b6a8f505e79..ec395679ae6207 100644
--- a/src/transformers/models/phi3/modeling_phi3.py
+++ b/src/transformers/models/phi3/modeling_phi3.py
@@ -1008,7 +1008,7 @@ def forward(
use_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
- "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
)
diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py
index 030c74b034b794..d0ea8ef0e376e0 100644
--- a/src/transformers/models/qwen2/modeling_qwen2.py
+++ b/src/transformers/models/qwen2/modeling_qwen2.py
@@ -920,7 +920,7 @@ def forward(
use_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
- "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
)
diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
index b196ed72a49b23..6f483e50cde065 100644
--- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
+++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
@@ -1084,7 +1084,7 @@ def forward(
use_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
- "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
)
diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py
index 27d0c856a61bd6..d91c0832ed33da 100755
--- a/src/transformers/models/stablelm/modeling_stablelm.py
+++ b/src/transformers/models/stablelm/modeling_stablelm.py
@@ -965,7 +965,7 @@ def forward(
use_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
- "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
)
diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py
index c359c07c69c0b8..0be37c4e1fb91c 100644
--- a/src/transformers/models/starcoder2/modeling_starcoder2.py
+++ b/src/transformers/models/starcoder2/modeling_starcoder2.py
@@ -894,7 +894,7 @@ def forward(
use_legacy_cache = True
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
logger.warning_once(
- "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
"Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
)
From 5ce0a113b5bc9dd8dbb92dd866772d79847d9a92 Mon Sep 17 00:00:00 2001
From: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Date: Mon, 16 Sep 2024 11:07:59 +0200
Subject: [PATCH 10/67] Fix SSH workflow (#33451)
* fix
* update
---------
Co-authored-by: ydshieh
---
.github/workflows/ssh-runner.yml | 15 +++++++++++++--
1 file changed, 13 insertions(+), 2 deletions(-)
diff --git a/.github/workflows/ssh-runner.yml b/.github/workflows/ssh-runner.yml
index b433abb484fac4..db649876f60492 100644
--- a/.github/workflows/ssh-runner.yml
+++ b/.github/workflows/ssh-runner.yml
@@ -58,8 +58,19 @@ jobs:
#because the SSH can be enabled dynamically if the workflow failed, so we need to store slack infos to be able to retrieve them during the waitforssh step
shell: bash
run: |
- if [ "${{ secrets[format('{0}_{1}', github.actor, 'SLACK_ID')] }}" != "" ]; then
- echo "SLACKCHANNEL=${{ secrets[format('{0}_{1}', github.actor, 'SLACK_ID')] }}" >> $GITHUB_ENV
+ echo "${{ github.actor }}"
+ github_actor=${{ github.actor }}
+ github_actor=${github_actor/'-'/'_'}
+ echo "$github_actor"
+ echo "github_actor=$github_actor" >> $GITHUB_ENV
+
+ - name: Store Slack infos
+ #because the SSH can be enabled dynamically if the workflow failed, so we need to store slack infos to be able to retrieve them during the waitforssh step
+ shell: bash
+ run: |
+ echo "${{ env.github_actor }}"
+ if [ "${{ secrets[format('{0}_{1}', env.github_actor, 'SLACK_ID')] }}" != "" ]; then
+ echo "SLACKCHANNEL=${{ secrets[format('{0}_{1}', env.github_actor, 'SLACK_ID')] }}" >> $GITHUB_ENV
else
echo "SLACKCHANNEL=${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}" >> $GITHUB_ENV
fi
From ce62a41880b5b70a304d068eb58f55894a5a7af8 Mon Sep 17 00:00:00 2001
From: Merve Noyan
Date: Mon, 16 Sep 2024 13:08:31 +0200
Subject: [PATCH 11/67] Add keypoint-detection task guide (#33274)
---------
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
---
docs/source/en/_toctree.yml | 2 +
docs/source/en/tasks/keypoint_detection.md | 154 +++++++++++++++++++++
2 files changed, 156 insertions(+)
create mode 100644 docs/source/en/tasks/keypoint_detection.md
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 235ea81a7f1ea6..7eff2a38302669 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -81,6 +81,8 @@
title: Image Feature Extraction
- local: tasks/mask_generation
title: Mask Generation
+ - local: tasks/keypoint_detection
+ title: Keypoint Detection
- local: tasks/knowledge_distillation_for_image_classification
title: Knowledge Distillation for Computer Vision
title: Computer Vision
diff --git a/docs/source/en/tasks/keypoint_detection.md b/docs/source/en/tasks/keypoint_detection.md
new file mode 100644
index 00000000000000..a0ec71a5c22000
--- /dev/null
+++ b/docs/source/en/tasks/keypoint_detection.md
@@ -0,0 +1,154 @@
+
+
+# Keypoint Detection
+
+[[open-in-colab]]
+
+Keypoint detection identifies and locates specific points of interest within an image. These keypoints, also known as landmarks, represent meaningful features of objects, such as facial features or object parts. Keypoint detection models take an image as input and return the following outputs:
+
+- **Keypoints and Scores**: Points of interest and their confidence scores.
+- **Descriptors**: A representation of the image region surrounding each keypoint, capturing its texture, gradient, orientation and other properties.
+
+In this guide, we will show how to extract keypoints from images.
+
+For this tutorial, we will use [SuperPoint](../model_doc/superpoint), a foundation model for keypoint detection.
+
+```python
+from transformers import AutoImageProcessor, SuperPointForKeypointDetection
+processor = AutoImageProcessor.from_pretrained("magic-leap-community/superpoint")
+model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/superpoint")
+```
+
+Let's test the model on the images below.
+
+
+
+
+
+
+
+```python
+import torch
+from PIL import Image
+import requests
+import cv2
+
+
+url_image_1 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"
+image_1 = Image.open(requests.get(url_image_1, stream=True).raw)
+url_image_2 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png"
+image_2 = Image.open(requests.get(url_image_2, stream=True).raw)
+
+images = [image_1, image_2]
+```
+
+We can now process our inputs and run inference.
+
+```python
+inputs = processor(images,return_tensors="pt").to(model.device, model.dtype)
+outputs = model(**inputs)
+```
+
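+The model output contains keypoints in relative coordinates, descriptors, masks and scores for each item in the batch. Because images can yield different numbers of keypoints, the outputs are padded to a common length, and the mask marks which entries are actual keypoints (1) rather than padding (0).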
+
+```python
+SuperPointKeypointDescriptionOutput(loss=None, keypoints=tensor([[[0.0437, 0.0167],
+ [0.0688, 0.0167],
+ [0.0172, 0.0188],
+ ...,
+ [0.5984, 0.9812],
+ [0.6953, 0.9812]]]),
+ scores=tensor([[0.0056, 0.0053, 0.0079, ..., 0.0125, 0.0539, 0.0377],
+ [0.0206, 0.0058, 0.0065, ..., 0.0000, 0.0000, 0.0000]],
+ grad_fn=), descriptors=tensor([[[-0.0807, 0.0114, -0.1210, ..., -0.1122, 0.0899, 0.0357],
+ [-0.0807, 0.0114, -0.1210, ..., -0.1122, 0.0899, 0.0357],
+ [-0.0807, 0.0114, -0.1210, ..., -0.1122, 0.0899, 0.0357],
+ ...],
+ grad_fn=), mask=tensor([[1, 1, 1, ..., 1, 1, 1],
+ [1, 1, 1, ..., 0, 0, 0]], dtype=torch.int32), hidden_states=None)
+```
+
+To plot the actual keypoints on the image, we need to postprocess the output. To do so, we pass the original image sizes, as `(height, width)` pairs, to `post_process_keypoint_detection` along with the outputs.
+
+```python
+image_sizes = [(image.size[1], image.size[0]) for image in images]
+outputs = processor.post_process_keypoint_detection(outputs, image_sizes)
+```
+
+The outputs are now a list of dictionaries where each dictionary is a processed output of keypoints, scores and descriptors.
+
+```python
+[{'keypoints': tensor([[ 226, 57],
+ [ 356, 57],
+ [ 89, 64],
+ ...,
+ [3604, 3391]], dtype=torch.int32),
+ 'scores': tensor([0.0056, 0.0053, ...], grad_fn=),
+ 'descriptors': tensor([[-0.0807, 0.0114, -0.1210, ..., -0.1122, 0.0899, 0.0357],
+ [-0.0807, 0.0114, -0.1210, ..., -0.1122, 0.0899, 0.0357]],
+ grad_fn=)},
+ {'keypoints': tensor([[ 46, 6],
+ [ 78, 6],
+ [422, 6],
+ [206, 404]], dtype=torch.int32),
+ 'scores': tensor([0.0206, 0.0058, 0.0065, 0.0053, 0.0070, ...,grad_fn=),
+ 'descriptors': tensor([[-0.0525, 0.0726, 0.0270, ..., 0.0389, -0.0189, -0.0211],
+ [-0.0525, 0.0726, 0.0270, ..., 0.0389, -0.0189, -0.0211]}]
+```
+
+We can use these to plot the keypoints.
+
+```python
+import matplotlib.pyplot as plt
+import torch
+
+for i in range(len(images)):
+ keypoints = outputs[i]["keypoints"]
+ scores = outputs[i]["scores"]
+ descriptors = outputs[i]["descriptors"]
+ keypoints = outputs[i]["keypoints"].detach().numpy()
+ scores = outputs[i]["scores"].detach().numpy()
+ image = images[i]
+ image_width, image_height = image.size
+
+ plt.axis('off')
+ plt.imshow(image)
+ plt.scatter(
+ keypoints[:, 0],
+ keypoints[:, 1],
+ s=scores * 100,
+ c='cyan',
+ alpha=0.4
+ )
+ plt.show()
+```
+
+Below you can see the outputs.
+
+
+
+
+
+
From 2f62146f0e916c3e6752b59d34853be6df0506f2 Mon Sep 17 00:00:00 2001
From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com>
Date: Mon, 16 Sep 2024 11:26:26 -0400
Subject: [PATCH 12/67] Uniformize kwargs for LLaVa processor and update docs
(#32858)
* Uniformize kwargs for LlaVa and update docs
* Change order of processor inputs in docstring
* Improve BC support for reversed images and text inputs
* cleanup llava processor call docstring
* Add encoded inputs as valid text inputs in reverse input check, add deprecation version in warning
* Put function check reversed images text outside base processor class
* Refactor _validate_images_text_input_order
* Add ProcessingUtilTester
* fix processing and test_processing
---
.../models/llava/modeling_llava.py | 2 +-
.../models/llava/processing_llava.py | 73 ++++++++++---------
tests/models/llava/test_modeling_llava.py | 20 ++---
tests/models/llava/test_processor_llava.py | 57 ++++++++++++++-
4 files changed, 104 insertions(+), 48 deletions(-)
diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py
index 9ad19ccee72228..eb1c55341b0784 100644
--- a/src/transformers/models/llava/modeling_llava.py
+++ b/src/transformers/models/llava/modeling_llava.py
@@ -405,7 +405,7 @@ def forward(
>>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
- >>> inputs = processor(text=prompt, images=image, return_tensors="pt")
+ >>> inputs = processor(images=image, text=prompt, return_tensors="pt")
>>> # Generate
>>> generate_ids = model.generate(**inputs, max_new_tokens=15)
diff --git a/src/transformers/models/llava/processing_llava.py b/src/transformers/models/llava/processing_llava.py
index 678724ae95be41..28a9410e6cbf0b 100644
--- a/src/transformers/models/llava/processing_llava.py
+++ b/src/transformers/models/llava/processing_llava.py
@@ -16,18 +16,33 @@
Processor class for Llava.
"""
-from typing import List, Optional, Union
+import sys
+from typing import List, Union
from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput, get_image_size, to_numpy_array
-from ...processing_utils import ProcessorMixin
-from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
-from ...utils import TensorType, logging
+from ...processing_utils import ProcessingKwargs, ProcessorMixin, _validate_images_text_input_order
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import logging
+if sys.version_info >= (3, 11):
+ from typing import Unpack
+else:
+ from typing_extensions import Unpack
+
logger = logging.get_logger(__name__)
+class LlavaProcessorKwargs(ProcessingKwargs, total=False):
+ _defaults = {
+ "text_kwargs": {
+ "padding": False,
+ },
+ "images_kwargs": {},
+ }
+
+
class LlavaProcessor(ProcessorMixin):
r"""
Constructs a Llava processor which wraps a Llava image processor and a Llava tokenizer into a single processor.
@@ -73,12 +88,11 @@ def __init__(
def __call__(
self,
- text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
images: ImageInput = None,
- padding: Union[bool, str, PaddingStrategy] = False,
- truncation: Union[bool, str, TruncationStrategy] = None,
- max_length=None,
- return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+ audio=None,
+ videos=None,
+ **kwargs: Unpack[LlavaProcessorKwargs],
) -> BatchFeature:
"""
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
@@ -88,29 +102,15 @@ def __call__(
of the above two methods for more information.
Args:
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+ tensor. Both channels-first and channels-last formats are supported.
text (`str`, `List[str]`, `List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
- images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
- The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
- tensor. Both channels-first and channels-last formats are supported.
- padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
- Select a strategy to pad the returned sequences (according to the model's padding side and padding
- index) among:
- - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
- sequence if provided).
- - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
- acceptable input length for the model if that argument is not provided.
- - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
- lengths).
- max_length (`int`, *optional*):
- Maximum length of the returned list and optionally padding length (see above).
- truncation (`bool`, *optional*):
- Activates truncation to cut input sequences longer than `max_length` to `max_length`.
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
-
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
@@ -125,8 +125,19 @@ def __call__(
`None`).
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
"""
+ if images is None and text is None:
+ raise ValueError("You have to specify at least one of `images` or `text`.")
+
+ # check if images and text inputs are reversed for BC
+ images, text = _validate_images_text_input_order(images, text)
+
+ output_kwargs = self._merge_kwargs(
+ LlavaProcessorKwargs,
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+ **kwargs,
+ )
if images is not None:
- image_inputs = self.image_processor(images, return_tensors=return_tensors)
+ image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
else:
image_inputs = {}
@@ -158,13 +169,7 @@ def __call__(
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
- text_inputs = self.tokenizer(
- prompt_strings,
- return_tensors=return_tensors,
- padding=padding,
- truncation=truncation,
- max_length=max_length,
- )
+ text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
return BatchFeature(data={**text_inputs, **image_inputs})
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py
index 5c05480ffa6dbb..305fc9e9a84cdb 100644
--- a/tests/models/llava/test_modeling_llava.py
+++ b/tests/models/llava/test_modeling_llava.py
@@ -274,7 +274,7 @@ def test_small_model_integration_test(self):
prompt = "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT:"
image_file = "https://llava-vl.github.io/static/images/view.jpg"
raw_image = Image.open(requests.get(image_file, stream=True).raw)
- inputs = self.processor(prompt, raw_image, return_tensors="pt")
+ inputs = self.processor(images=raw_image, text=prompt, return_tensors="pt")
EXPECTED_INPUT_IDS = torch.tensor([[1, 32000, 28705, 13, 11123, 28747, 1824, 460, 272, 1722,315, 1023, 347, 13831, 925, 684, 739, 315, 3251, 456,1633, 28804, 13, 4816, 8048, 12738, 28747]]) # fmt: skip
self.assertTrue(torch.equal(inputs["input_ids"], EXPECTED_INPUT_IDS))
@@ -299,7 +299,7 @@ def test_small_model_integration_test_llama_single(self):
prompt = "USER: \nWhat are the things I should be cautious about when I visit this place? ASSISTANT:"
image_file = "https://llava-vl.github.io/static/images/view.jpg"
raw_image = Image.open(requests.get(image_file, stream=True).raw)
- inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
+ inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
output = model.generate(**inputs, max_new_tokens=900, do_sample=False)
EXPECTED_DECODED_TEXT = "USER: \nWhat are the things I should be cautious about when I visit this place? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, there are a few things to be cautious about. First, be aware of the weather conditions, as sudden changes in weather can make the pier unsafe to walk on. Second, be mindful of the water depth and any potential hazards, such as submerged rocks or debris, that could cause accidents or injuries. Additionally, be cautious of the tides and currents, as they can change rapidly and pose a risk to swimmers or those who venture too close to the edge of the pier. Finally, be respectful of the environment and other visitors, and follow any posted rules or guidelines for the area." # fmt: skip
@@ -325,7 +325,7 @@ def test_small_model_integration_test_llama_batched(self):
image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw)
image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
- inputs = processor(prompts, images=[image1, image2], return_tensors="pt", padding=True)
+ inputs = processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True)
output = model.generate(**inputs, max_new_tokens=20)
@@ -349,7 +349,7 @@ def test_small_model_integration_test_batch(self):
image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw)
image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
- inputs = self.processor(prompts, images=[image1, image2], return_tensors="pt", padding=True)
+ inputs = self.processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True)
output = model.generate(**inputs, max_new_tokens=20)
@@ -381,7 +381,7 @@ def test_small_model_integration_test_llama_batched_regression(self):
image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw)
image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
- inputs = processor(prompts, images=[image1, image2, image1], return_tensors="pt", padding=True)
+ inputs = processor(images=[image1, image2, image1], text=prompts, return_tensors="pt", padding=True)
output = model.generate(**inputs, max_new_tokens=20)
@@ -409,8 +409,8 @@ def test_batched_generation(self):
image2 = Image.open(requests.get(url2, stream=True).raw)
inputs = processor(
- text=[prompt1, prompt2, prompt3],
images=[image1, image2, image1, image2],
+ text=[prompt1, prompt2, prompt3],
return_tensors="pt",
padding=True,
).to(torch_device)
@@ -444,7 +444,7 @@ def test_llava_index_error_bug(self):
image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
raw_image = Image.open(requests.get(image_file, stream=True).raw)
- inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
+ inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
# Make sure that `generate` works
_ = model.generate(**inputs, max_new_tokens=20)
@@ -510,7 +510,7 @@ def test_generation_no_images(self):
processor = AutoProcessor.from_pretrained(model_id)
# Prepare inputs with no images
- inputs = processor("Hello, I am", return_tensors="pt").to(torch_device)
+ inputs = processor(text="Hello, I am", return_tensors="pt").to(torch_device)
# Make sure that `generate` works
_ = model.generate(**inputs, max_new_tokens=20)
@@ -554,13 +554,13 @@ def test_expansion_in_processing(self):
# check processing with expansion of inputs
processor.vision_feature_select_strategy = "default"
processor.patch_size = 14
- inputs_expanded = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
+ inputs_expanded = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
self.assertTrue(inputs_expanded.input_ids.shape[-1] == 593)
# check processing without expansion of inputs (legacy behavior)
processor.vision_feature_select_strategy = None
processor.patch_size = None
- inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
+ inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
self.assertTrue(inputs.input_ids.shape[-1] == 18)
# generate exactly 20 tokens
diff --git a/tests/models/llava/test_processor_llava.py b/tests/models/llava/test_processor_llava.py
index 54c1b4674cbcef..5b05a8b92ea513 100644
--- a/tests/models/llava/test_processor_llava.py
+++ b/tests/models/llava/test_processor_llava.py
@@ -11,18 +11,43 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+import shutil
+import tempfile
import unittest
-from transformers.testing_utils import require_vision
+from transformers import AutoProcessor, AutoTokenizer, LlamaTokenizerFast, LlavaProcessor
+from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_vision_available
+from ...test_processing_common import ProcessorTesterMixin
+
if is_vision_available():
- from transformers import AutoTokenizer, LlavaProcessor
+ from transformers import CLIPImageProcessor
@require_vision
-class LlavaProcessorTest(unittest.TestCase):
+class LlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
+ processor_class = LlavaProcessor
+
+ def setUp(self):
+ self.tmpdirname = tempfile.mkdtemp()
+ image_processor = CLIPImageProcessor(do_center_crop=False)
+ tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b")
+
+ processor = LlavaProcessor(image_processor=image_processor, tokenizer=tokenizer)
+
+ processor.save_pretrained(self.tmpdirname)
+
+ def get_tokenizer(self, **kwargs):
+ return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
+
+ def get_image_processor(self, **kwargs):
+ return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
+
+ def tearDown(self):
+ shutil.rmtree(self.tmpdirname)
+
def test_can_load_various_tokenizers(self):
for checkpoint in ["Intel/llava-gemma-2b", "llava-hf/llava-1.5-7b-hf"]:
processor = LlavaProcessor.from_pretrained(checkpoint)
@@ -45,3 +70,29 @@ def test_chat_template(self):
formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
self.assertEqual(expected_prompt, formatted_prompt)
+
+ @require_torch
+ @require_vision
+ def test_unstructured_kwargs_batched(self):
+ if "image_processor" not in self.processor_class.attributes:
+ self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+ image_processor = self.get_component("image_processor")
+ tokenizer = self.get_component("tokenizer")
+
+ processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+ self.skip_processor_without_typed_kwargs(processor)
+
+ input_str = ["lower newer", "upper older longer string"]
+ image_input = self.prepare_image_inputs() * 2
+ inputs = processor(
+ images=image_input,
+ text=input_str,
+ return_tensors="pt",
+ size={"height": 214, "width": 214},
+ padding="longest",
+ max_length=76,
+ )
+
+ self.assertEqual(inputs["pixel_values"].shape[2], 214)
+
+ self.assertEqual(len(inputs["input_ids"][0]), 5)
From c7a91f5adf976e0517c4a7f1506fb0c24f353053 Mon Sep 17 00:00:00 2001
From: Sergio Paniego Blanco
Date: Mon, 16 Sep 2024 18:52:27 +0200
Subject: [PATCH 13/67] `Agents, supercharged - Multi-agents, External tools,
and more` docs typo fixed (#33478)
* Typo fixed in Agents, supercharged
---
docs/source/en/agents_advanced.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/source/en/agents_advanced.md b/docs/source/en/agents_advanced.md
index e7469a310c4102..399eeb9b70eb20 100644
--- a/docs/source/en/agents_advanced.md
+++ b/docs/source/en/agents_advanced.md
@@ -34,7 +34,7 @@ You can easily build hierarchical multi-agent systems with `transformers.agents`
To do so, encapsulate the agent in a [`ManagedAgent`] object. This object needs arguments `agent`, `name`, and a `description`, which will then be embedded in the manager agent's system prompt to let it know how to call this managed agent, as we also do for tools.
-Here's an example of making an agent that managed a specitif web search agent using our [`DuckDuckGoSearchTool`]:
+Here's an example of making an agent that manages a specific web search agent using our [`DuckDuckGoSearchTool`]:
```py
from transformers.agents import ReactCodeAgent, HfApiEngine, DuckDuckGoSearchTool, ManagedAgent
From c2d05897bf4e8b34773838accaddd66028bc148d Mon Sep 17 00:00:00 2001
From: Ahmed Almaghz <53489256+AhmedAlmaghz@users.noreply.github.com>
Date: Mon, 16 Sep 2024 20:02:03 +0300
Subject: [PATCH 14/67] [i18n-ar] Add File : `docs/source/ar/_toctree.yml`
(#32696)
* Update ar lang build_documentation.yml
* Update ar lang build_pr_documentation.yml
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/pipeline_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/autoclass_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/autoclass_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/autoclass_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/autoclass_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/autoclass_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/autoclass_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/autoclass_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/autoclass_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/autoclass_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/autoclass_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/autoclass_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/autoclass_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/autoclass_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/autoclass_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/preprocessing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/training.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/run_scripts.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/run_scripts.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/run_scripts.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/run_scripts.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/run_scripts.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/run_scripts.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/run_scripts.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/accelerate.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/accelerate.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/accelerate.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/accelerate.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/accelerate.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/accelerate.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Create _config.py
* Update _toctree.yml
* Update _toctree.yml
* Update docs/source/ar/peft.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/peft.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/peft.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/peft.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/peft.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/peft.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/peft.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/peft.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/peft.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update _toctree.yml
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/model_sharing.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/conversations.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/conversations.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/conversations.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/conversations.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/conversations.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/conversations.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/conversations.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/conversations.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/conversations.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/conversations.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/conversations.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/conversations.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/conversations.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/conversations.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/conversations.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/agents.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update docs/source/ar/llm_tutorial.md
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
* Update llm_tutorial.md
* Update _toctree.yml
* Update autoclass_tutorial.md
* Update autoclass_tutorial.md
* Update preprocessing.md
* Update glossary.md
* Update run_scripts.md
* Update run_scripts.md
* Update run_scripts.md
---------
Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
---
.github/workflows/build_documentation.yml | 2 +-
.github/workflows/build_pr_documentation.yml | 2 +-
docs/source/ar/_config.py | 14 +
docs/source/ar/_toctree.yml | 892 +++++++++++++++++++
docs/source/ar/accelerate.md | 120 +++
docs/source/ar/agents.md | 539 +++++++++++
docs/source/ar/autoclass_tutorial.md | 167 ++++
docs/source/ar/conversations.md | 204 +++++
docs/source/ar/glossary.md | 446 ++++++++++
docs/source/ar/index.md | 342 +++++++
docs/source/ar/installation.md | 246 +++++
docs/source/ar/llm_tutorial.md | 248 ++++++
docs/source/ar/model_sharing.md | 223 +++++
docs/source/ar/peft.md | 250 ++++++
docs/source/ar/pipeline_tutorial.md | 315 +++++++
docs/source/ar/preprocessing.md | 521 +++++++++++
docs/source/ar/quicktour.md | 543 +++++++++++
docs/source/ar/run_scripts.md | 351 ++++++++
docs/source/ar/training.md | 412 +++++++++
19 files changed, 5835 insertions(+), 2 deletions(-)
create mode 100644 docs/source/ar/_config.py
create mode 100644 docs/source/ar/_toctree.yml
create mode 100644 docs/source/ar/accelerate.md
create mode 100644 docs/source/ar/agents.md
create mode 100644 docs/source/ar/autoclass_tutorial.md
create mode 100644 docs/source/ar/conversations.md
create mode 100644 docs/source/ar/glossary.md
create mode 100644 docs/source/ar/index.md
create mode 100644 docs/source/ar/installation.md
create mode 100644 docs/source/ar/llm_tutorial.md
create mode 100644 docs/source/ar/model_sharing.md
create mode 100644 docs/source/ar/peft.md
create mode 100644 docs/source/ar/pipeline_tutorial.md
create mode 100644 docs/source/ar/preprocessing.md
create mode 100644 docs/source/ar/quicktour.md
create mode 100644 docs/source/ar/run_scripts.md
create mode 100644 docs/source/ar/training.md
diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml
index e3e3b5f2df37f1..b25567fb092a14 100644
--- a/.github/workflows/build_documentation.yml
+++ b/.github/workflows/build_documentation.yml
@@ -15,7 +15,7 @@ jobs:
commit_sha: ${{ github.sha }}
package: transformers
notebook_folder: transformers_doc
- languages: de en es fr hi it ko pt tr zh ja te
+ languages: ar de en es fr hi it ko pt tr zh ja te
custom_container: huggingface/transformers-doc-builder
secrets:
token: ${{ secrets.HUGGINGFACE_PUSH }}
diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml
index c8d073ea34688f..f698f860b2f93c 100644
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@@ -14,5 +14,5 @@ jobs:
commit_sha: ${{ github.event.pull_request.head.sha }}
pr_number: ${{ github.event.number }}
package: transformers
- languages: de en es fr hi it ko pt tr zh ja te
+ languages: ar de en es fr hi it ko pt tr zh ja te
custom_container: huggingface/transformers-doc-builder
diff --git a/docs/source/ar/_config.py b/docs/source/ar/_config.py
new file mode 100644
index 00000000000000..f49e4e4731965a
--- /dev/null
+++ b/docs/source/ar/_config.py
@@ -0,0 +1,14 @@
+# docstyle-ignore
+INSTALL_CONTENT = """
+# Transformers installation
+! pip install transformers datasets evaluate accelerate
+# To install from source instead of the last release, comment the command above and uncomment the following one.
+# ! pip install git+https://github.com/huggingface/transformers.git
+"""
+
+notebook_first_cells = [{"type": "code", "content": INSTALL_CONTENT}]
+black_avoid_patterns = {
+ "{processor_class}": "FakeProcessorClass",
+ "{model_class}": "FakeModelClass",
+ "{object_class}": "FakeObjectClass",
+}
diff --git a/docs/source/ar/_toctree.yml b/docs/source/ar/_toctree.yml
new file mode 100644
index 00000000000000..39e0ae14e19c29
--- /dev/null
+++ b/docs/source/ar/_toctree.yml
@@ -0,0 +1,892 @@
+- sections:
+ - local: index
+ title: 🤗 المحولات
+ - local: quicktour
+ title: جولة سريعة
+ - local: installation
+ title: التثبيت
+ title: البدء
+- sections:
+ - local: pipeline_tutorial
+ title: تشغيل الاستنتاج باستخدام خطوط الأنابيب
+ - local: autoclass_tutorial
+ title: كتابة تعليمات برمجية متكيفة باستخدام AutoClass
+ - local: preprocessing
+ title: معالجة البيانات مسبقًا
+ - local: training
+ title: ضبط نموذج مسبق التدريب
+ - local: run_scripts
+ title: التدريب باستخدام نص برمجي
+ - local: accelerate
+ title: إعداد تدريب موزع باستخدام 🤗 Accelerate
+ - local: peft
+ title: تحميل النماذج المخصصة وتدريبها باستخدام 🤗 PEFT
+ - local: model_sharing
+ title: مشاركة نموذجك
+ - local: agents
+ title: الوكلاء
+ - local: llm_tutorial
+ title: التوليد باستخدام LLMs
+ - local: conversations
+ title: الدردشة مع المحولات
+ title: البرامج التعليمية
+# - sections:
+# - isExpanded: false
+# sections:
+# - local: tasks/sequence_classification
+# title: تصنيف النصوص
+# - local: tasks/token_classification
+# title: تصنيف الرموز
+# - local: tasks/question_answering
+# title: الإجابة على الأسئلة
+# - local: tasks/language_modeling
+# title: نمذجة اللغة السببية
+# - local: tasks/masked_language_modeling
+# title: نمذجة اللغة المقنعة
+# - local: tasks/translation
+# title: الترجمة
+# - local: tasks/summarization
+# title: التلخيص
+# - local: tasks/multiple_choice
+# title: الاختيار المتعدد
+# title: معالجة اللغات الطبيعية
+# - isExpanded: false
+# sections:
+# - local: tasks/audio_classification
+# title: تصنيف الصوت
+# - local: tasks/asr
+# title: التعرف التلقائي على الكلام
+# title: الصوت
+# - isExpanded: false
+# sections:
+# - local: tasks/image_classification
+# title: تصنيف الصور
+# - local: tasks/semantic_segmentation
+# title: تجزئة الصور
+# - local: tasks/video_classification
+# title: تصنيف الفيديو
+# - local: tasks/object_detection
+# title: اكتشاف الأشياء
+# - local: tasks/zero_shot_object_detection
+# title: اكتشاف الأشياء بدون تدريب
+# - local: tasks/zero_shot_image_classification
+# title: تصنيف الصور بدون تدريب
+# - local: tasks/monocular_depth_estimation
+# title: تقدير العمق
+# - local: tasks/image_to_image
+# title: صورة إلى صورة
+# - local: tasks/image_feature_extraction
+# title: استخراج ميزات الصورة
+# - local: tasks/mask_generation
+# title: توليد القناع
+# - local: tasks/knowledge_distillation_for_image_classification
+# title: التقليل المعرفي للرؤية الحاسوبية
+# title: الرؤية الحاسوبية
+# - isExpanded: false
+# sections:
+# - local: tasks/image_captioning
+# title: وصف الصور Image captioning
+# - local: tasks/document_question_answering
+# title: الإجابة على أسئلة المستندات
+# - local: tasks/visual_question_answering
+# title: الإجابة على الأسئلة المرئية
+# - local: tasks/text-to-speech
+# title: تحويل النص إلى كلام
+# title: المتعددة الوسائط
+# - isExpanded: false
+# sections:
+# - local: generation_strategies
+# title: تخصيص استراتيجية التوليد
+# - local: kv_cache
+# title: أفضل الممارسات للتوليد باستخدام ذاكرة التخزين المؤقت
+# title: التوليد
+# - isExpanded: false
+# sections:
+# - local: tasks/idefics
+# title: مهام الصور مع IDEFICS
+# - local: tasks/prompting
+# title: دليل إرشادي لمحفزات النماذج اللغوية الكبيرة
+# title: الإرشاد
+# title: أدلة المهام
+# - sections:
+# - local: fast_tokenizers
+# title: استخدم برامج التجزئة السريعة من 🤗 Tokenizers
+# - local: multilingual
+# title: تشغيل الاستنتاج باستخدام نماذج متعددة اللغات
+# - local: create_a_model
+# title: استخدام واجهات برمجة التطبيقات الخاصة بالنموذج
+# - local: custom_models
+# title: مشاركة نموذج مخصص
+# - local: chat_templating
+# title: قوالب لنماذج الدردشة
+# - local: trainer
+# title: المدرب
+# - local: sagemaker
+# title: تشغيل التدريب على Amazon SageMaker
+# - local: serialization
+# title: التصدير إلى ONNX
+# - local: tflite
+# title: التصدير إلى TFLite
+# - local: torchscript
+# title: التصدير إلى TorchScript
+# - local: benchmarks
+# title: المعايير
+# - local: notebooks
+# title: دفاتر الملاحظات مع الأمثلة
+# - local: community
+# title: موارد المجتمع
+# - local: troubleshooting
+# title: استكشاف الأخطاء وإصلاحها
+# - local: gguf
+# title: التوافق مع ملفات GGUF
+# title: أدلة المطورين
+# - sections:
+# - local: quantization/overview
+# title: نظرة عامة
+# - local: quantization/bitsandbytes
+# title: bitsandbytes
+# - local: quantization/gptq
+# title: GPTQ
+# - local: quantization/awq
+# title: AWQ
+# - local: quantization/aqlm
+# title: AQLM
+# - local: quantization/quanto
+# title: Quanto
+# - local: quantization/eetq
+# title: EETQ
+# - local: quantization/hqq
+# title: HQQ
+# - local: quantization/optimum
+# title: Optimum
+# - local: quantization/contribute
+# title: المساهمة بطريقة جديدة للتكميم
+# title: أساليب التكميم
+# - sections:
+# - local: performance
+# title: الأداء-نظرة عامة
+# - local: llm_optims
+# title: تحسين الاستدلال LLM
+# - sections:
+# - local: perf_train_gpu_one
+# title: استخدام عدة وحدات معالجة رسوميات (GPUs) بشكل متوازٍ
+# - local: perf_train_gpu_many
+# title: وحدات معالجة الرسومات (GPU) متعددة والتوازي
+# - local: fsdp
+# title: Fully Sharded Data Parallel
+# - local: deepspeed
+# title: DeepSpeed
+# - local: perf_train_cpu
+# title: التدريب الفعال على وحدة المعالجة المركزية (CPU)
+# - local: perf_train_cpu_many
+# title: التدريب الموزع لوحدة المعالجة المركزية (CPU)
+# - local: perf_train_tpu_tf
+# title: التدريب على (TPU) باستخدام TensorFlow
+# - local: perf_train_special
+# title: تدريب PyTorch على Apple silicon
+# - local: perf_hardware
+# title: الأجهزة المخصصة للتدريب
+# - local: hpo_train
+# title: البحث عن المعاملات المثلى باستخدام واجهة برمجة تطبيقات المدرب
+# title: تقنيات التدريب الفعال
+# - sections:
+# - local: perf_infer_cpu
+# title: الإستدلال على وحدة المعالجة المركزية (CPU)
+# - local: perf_infer_gpu_one
+# title: الإستدلال على وحدة معالجة الرسومات (GPU)
+# title: تحسين الاستدلال
+# - local: big_models
+# title: إنشاء نموذج كبير
+# - local: debugging
+# title: تصحيح الأخطاء البرمجية
+# - local: tf_xla
+# title: تكامل XLA لنماذج TensorFlow
+# - local: perf_torch_compile
+# title: تحسين الاستدلال باستخدام `torch.compile()`
+# title: الأداء وقابلية التوسع
+# - sections:
+# - local: contributing
+# title: كيفية المساهمة في 🤗 المحولات؟
+# - local: add_new_model
+# title: كيفية إضافة نموذج إلى 🤗 المحولات؟
+# - local: add_new_pipeline
+# title: كيفية إضافة خط أنابيب إلى 🤗 المحولات؟
+# - local: testing
+# title: الاختبار
+# - local: pr_checks
+# title: التحقق من طلب السحب
+# title: المساهمة
+- sections:
+ # - local: philosophy
+ # title: الفلسفة
+ - local: glossary
+ title: قاموس المصطلحات (قائمة الكلمات)
+ # - local: task_summary
+ # title: ما الذي يمكن أن تفعله 🤗 المحولات
+ # - local: tasks_explained
+ # title: كيف تحل المحولات المهام
+ # - local: model_summary
+ # title: عائلة نماذج المحول
+ # - local: tokenizer_summary
+ # title: ملخص برنامج مقسم النصوص (tokenizers)
+ # - local: attention
+ # title: الانتباه Attention
+ # - local: pad_truncation
+ # title: الحشو والتقليم
+ # - local: bertology
+ # title: BERTology
+ # - local: perplexity
+ # title: حيرة النماذج ذات الطول الثابت
+ # - local: pipeline_webserver
+ # title: خطوط الأنابيب للاستدلال على خادم الويب
+ # - local: model_memory_anatomy
+ # title: تشريح تدريب النموذج
+ # - local: llm_tutorial_optimization
+ # title: الاستفادة القصوى من LLMs
+ title: أطر مفاهيمية
+# - sections:
+# - sections:
+# - local: main_classes/agent
+# title: الوكلاء والأدوات
+# - local: model_doc/auto
+# title: فئات يتم إنشاؤها ديناميكيًا
+# - local: main_classes/backbones
+# title: العمود الفقري
+# - local: main_classes/callback
+# title: عمليات الاسترجاع
+# - local: main_classes/configuration
+# title: التكوين
+# - local: main_classes/data_collator
+# title: مجمع البيانات
+# - local: main_classes/keras_callbacks
+# title: استدعاءات Keras
+# - local: main_classes/logging
+# title: التسجيل
+# - local: main_classes/model
+# title: النماذج
+# - local: main_classes/text_generation
+# title: توليد النصوص
+# - local: main_classes/onnx
+# title: ONNX
+# - local: main_classes/optimizer_schedules
+# title: التحسين
+# - local: main_classes/output
+# title: مخرجات النموذج
+# - local: main_classes/pipelines
+# title: خطوط الأنابيب
+# - local: main_classes/processors
+# title: المعالجات
+# - local: main_classes/quantization
+# title: التكميم
+# - local: main_classes/tokenizer
+# title: برنامج مقسم النصوص
+# - local: main_classes/trainer
+# title: المدرب
+# - local: main_classes/deepspeed
+# title: DeepSpeed
+# - local: main_classes/feature_extractor
+# title: مستخرج الميزات
+# - local: main_classes/image_processor
+# title: معالج الصور
+# title: الفئات الرئيسية
+# - sections:
+# - isExpanded: false
+# sections:
+# - local: model_doc/albert
+# title: ALBERT
+# - local: model_doc/bart
+# title: BART
+# - local: model_doc/barthez
+# title: BARThez
+# - local: model_doc/bartpho
+# title: BARTpho
+# - local: model_doc/bert
+# title: BERT
+# - local: model_doc/bert-generation
+# title: BertGeneration
+# - local: model_doc/bert-japanese
+# title: BertJapanese
+# - local: model_doc/bertweet
+# title: Bertweet
+# - local: model_doc/big_bird
+# title: BigBird
+# - local: model_doc/bigbird_pegasus
+# title: BigBirdPegasus
+# - local: model_doc/biogpt
+# title: BioGpt
+# - local: model_doc/blenderbot
+# title: Blenderbot
+# - local: model_doc/blenderbot-small
+# title: Blenderbot Small
+# - local: model_doc/bloom
+# title: BLOOM
+# - local: model_doc/bort
+# title: BORT
+# - local: model_doc/byt5
+# title: ByT5
+# - local: model_doc/camembert
+# title: CamemBERT
+# - local: model_doc/canine
+# title: CANINE
+# - local: model_doc/codegen
+# title: CodeGen
+# - local: model_doc/code_llama
+# title: CodeLlama
+# - local: model_doc/cohere
+# title: Cohere
+# - local: model_doc/convbert
+# title: ConvBERT
+# - local: model_doc/cpm
+# title: CPM
+# - local: model_doc/cpmant
+# title: CPMANT
+# - local: model_doc/ctrl
+# title: CTRL
+# - local: model_doc/dbrx
+# title: DBRX
+# - local: model_doc/deberta
+# title: DeBERTa
+# - local: model_doc/deberta-v2
+# title: DeBERTa-v2
+# - local: model_doc/dialogpt
+# title: DialoGPT
+# - local: model_doc/distilbert
+# title: DistilBERT
+# - local: model_doc/dpr
+# title: DPR
+# - local: model_doc/electra
+# title: ELECTRA
+# - local: model_doc/encoder-decoder
+# title: Encoder Decoder Models
+# - local: model_doc/ernie
+# title: ERNIE
+# - local: model_doc/ernie_m
+# title: ErnieM
+# - local: model_doc/esm
+# title: ESM
+# - local: model_doc/falcon
+# title: Falcon
+# - local: model_doc/fastspeech2_conformer
+# title: FastSpeech2Conformer
+# - local: model_doc/flan-t5
+# title: FLAN-T5
+# - local: model_doc/flan-ul2
+# title: FLAN-UL2
+# - local: model_doc/flaubert
+# title: FlauBERT
+# - local: model_doc/fnet
+# title: FNet
+# - local: model_doc/fsmt
+# title: FSMT
+# - local: model_doc/funnel
+# title: Funnel Transformer
+# - local: model_doc/fuyu
+# title: Fuyu
+# - local: model_doc/gemma
+# title: Gemma
+# - local: model_doc/openai-gpt
+# title: GPT
+# - local: model_doc/gpt_neo
+# title: GPT Neo
+# - local: model_doc/gpt_neox
+# title: GPT NeoX
+# - local: model_doc/gpt_neox_japanese
+# title: GPT NeoX Japanese
+# - local: model_doc/gptj
+# title: GPT-J
+# - local: model_doc/gpt2
+# title: GPT2
+# - local: model_doc/gpt_bigcode
+# title: GPTBigCode
+# - local: model_doc/gptsan-japanese
+# title: GPTSAN Japanese
+# - local: model_doc/gpt-sw3
+# title: GPTSw3
+# - local: model_doc/herbert
+# title: HerBERT
+# - local: model_doc/ibert
+# title: I-BERT
+# - local: model_doc/jamba
+# title: Jamba
+# - local: model_doc/jetmoe
+# title: JetMoe
+# - local: model_doc/jukebox
+# title: Jukebox
+# - local: model_doc/led
+# title: LED
+# - local: model_doc/llama
+# title: LLaMA
+# - local: model_doc/llama2
+# title: Llama2
+# - local: model_doc/llama3
+# title: Llama3
+# - local: model_doc/longformer
+# title: Longformer
+# - local: model_doc/longt5
+# title: LongT5
+# - local: model_doc/luke
+# title: LUKE
+# - local: model_doc/m2m_100
+# title: M2M100
+# - local: model_doc/madlad-400
+# title: MADLAD-400
+# - local: model_doc/mamba
+# title: Mamba
+# - local: model_doc/marian
+# title: MarianMT
+# - local: model_doc/markuplm
+# title: MarkupLM
+# - local: model_doc/mbart
+# title: MBart and MBart-50
+# - local: model_doc/mega
+# title: MEGA
+# - local: model_doc/megatron-bert
+# title: MegatronBERT
+# - local: model_doc/megatron_gpt2
+# title: MegatronGPT2
+# - local: model_doc/mistral
+# title: Mistral
+# - local: model_doc/mixtral
+# title: Mixtral
+# - local: model_doc/mluke
+# title: mLUKE
+# - local: model_doc/mobilebert
+# title: MobileBERT
+# - local: model_doc/mpnet
+# title: MPNet
+# - local: model_doc/mpt
+# title: MPT
+# - local: model_doc/mra
+# title: MRA
+# - local: model_doc/mt5
+# title: MT5
+# - local: model_doc/mvp
+# title: MVP
+# - local: model_doc/nezha
+# title: NEZHA
+# - local: model_doc/nllb
+# title: NLLB
+# - local: model_doc/nllb-moe
+# title: NLLB-MoE
+# - local: model_doc/nystromformer
+# title: Nyströmformer
+# - local: model_doc/olmo
+# title: OLMo
+# - local: model_doc/open-llama
+# title: Open-Llama
+# - local: model_doc/opt
+# title: OPT
+# - local: model_doc/pegasus
+# title: Pegasus
+# - local: model_doc/pegasus_x
+# title: PEGASUS-X
+# - local: model_doc/persimmon
+# title: Persimmon
+# - local: model_doc/phi
+# title: Phi
+# - local: model_doc/phi3
+# title: Phi-3
+# - local: model_doc/phobert
+# title: PhoBERT
+# - local: model_doc/plbart
+# title: PLBart
+# - local: model_doc/prophetnet
+# title: ProphetNet
+# - local: model_doc/qdqbert
+# title: QDQBert
+# - local: model_doc/qwen2
+# title: Qwen2
+# - local: model_doc/qwen2_moe
+# title: Qwen2MoE
+# - local: model_doc/rag
+# title: RAG
+# - local: model_doc/realm
+# title: REALM
+# - local: model_doc/recurrent_gemma
+# title: RecurrentGemma
+# - local: model_doc/reformer
+# title: Reformer
+# - local: model_doc/rembert
+# title: RemBERT
+# - local: model_doc/retribert
+# title: RetriBERT
+# - local: model_doc/roberta
+# title: RoBERTa
+# - local: model_doc/roberta-prelayernorm
+# title: RoBERTa-PreLayerNorm
+# - local: model_doc/roc_bert
+# title: RoCBert
+# - local: model_doc/roformer
+# title: RoFormer
+# - local: model_doc/rwkv
+# title: RWKV
+# - local: model_doc/splinter
+# title: Splinter
+# - local: model_doc/squeezebert
+# title: SqueezeBERT
+# - local: model_doc/stablelm
+# title: StableLm
+# - local: model_doc/starcoder2
+# title: Starcoder2
+# - local: model_doc/switch_transformers
+# title: SwitchTransformers
+# - local: model_doc/t5
+# title: T5
+# - local: model_doc/t5v1.1
+# title: T5v1.1
+# - local: model_doc/tapex
+# title: TAPEX
+# - local: model_doc/transfo-xl
+# title: Transformer XL
+# - local: model_doc/ul2
+# title: UL2
+# - local: model_doc/umt5
+# title: UMT5
+# - local: model_doc/xmod
+# title: X-MOD
+# - local: model_doc/xglm
+# title: XGLM
+# - local: model_doc/xlm
+# title: XLM
+# - local: model_doc/xlm-prophetnet
+# title: XLM-ProphetNet
+# - local: model_doc/xlm-roberta
+# title: XLM-RoBERTa
+# - local: model_doc/xlm-roberta-xl
+# title: XLM-RoBERTa-XL
+# - local: model_doc/xlm-v
+# title: XLM-V
+# - local: model_doc/xlnet
+# title: XLNet
+# - local: model_doc/yoso
+# title: YOSO
+# title: Text models
+# - isExpanded: false
+# sections:
+# - local: model_doc/beit
+# title: BEiT
+# - local: model_doc/bit
+# title: BiT
+# - local: model_doc/conditional_detr
+# title: Conditional DETR
+# - local: model_doc/convnext
+# title: ConvNeXT
+# - local: model_doc/convnextv2
+# title: ConvNeXTV2
+# - local: model_doc/cvt
+# title: CVT
+# - local: model_doc/deformable_detr
+# title: Deformable DETR
+# - local: model_doc/deit
+# title: DeiT
+# - local: model_doc/depth_anything
+# title: Depth Anything
+# - local: model_doc/deta
+# title: DETA
+# - local: model_doc/detr
+# title: DETR
+# - local: model_doc/dinat
+# title: DiNAT
+# - local: model_doc/dinov2
+# title: DINOV2
+# - local: model_doc/dit
+# title: DiT
+# - local: model_doc/dpt
+# title: DPT
+# - local: model_doc/efficientformer
+# title: EfficientFormer
+# - local: model_doc/efficientnet
+# title: EfficientNet
+# - local: model_doc/focalnet
+# title: FocalNet
+# - local: model_doc/glpn
+# title: GLPN
+# - local: model_doc/imagegpt
+# title: ImageGPT
+# - local: model_doc/levit
+# title: LeViT
+# - local: model_doc/mask2former
+# title: Mask2Former
+# - local: model_doc/maskformer
+# title: MaskFormer
+# - local: model_doc/mobilenet_v1
+# title: MobileNetV1
+# - local: model_doc/mobilenet_v2
+# title: MobileNetV2
+# - local: model_doc/mobilevit
+# title: MobileViT
+# - local: model_doc/mobilevitv2
+# title: MobileViTV2
+# - local: model_doc/nat
+# title: NAT
+# - local: model_doc/poolformer
+# title: PoolFormer
+# - local: model_doc/pvt
+# title: Pyramid Vision Transformer (PVT)
+# - local: model_doc/pvt_v2
+# title: Pyramid Vision Transformer v2 (PVTv2)
+# - local: model_doc/regnet
+# title: RegNet
+# - local: model_doc/resnet
+# title: ResNet
+# - local: model_doc/segformer
+# title: SegFormer
+# - local: model_doc/seggpt
+# title: SegGpt
+# - local: model_doc/superpoint
+# title: SuperPoint
+# - local: model_doc/swiftformer
+# title: SwiftFormer
+# - local: model_doc/swin
+# title: Swin Transformer
+# - local: model_doc/swinv2
+# title: Swin Transformer V2
+# - local: model_doc/swin2sr
+# title: Swin2SR
+# - local: model_doc/table-transformer
+# title: Table Transformer
+# - local: model_doc/upernet
+# title: UperNet
+# - local: model_doc/van
+# title: VAN
+# - local: model_doc/vit
+# title: Vision Transformer (ViT)
+# - local: model_doc/vit_hybrid
+# title: ViT Hybrid
+# - local: model_doc/vitdet
+# title: ViTDet
+# - local: model_doc/vit_mae
+# title: ViTMAE
+# - local: model_doc/vitmatte
+# title: ViTMatte
+# - local: model_doc/vit_msn
+# title: ViTMSN
+# - local: model_doc/yolos
+# title: YOLOS
+# title: Vision models
+# - isExpanded: false
+# sections:
+# - local: model_doc/audio-spectrogram-transformer
+# title: Audio Spectrogram Transformer
+# - local: model_doc/bark
+# title: Bark
+# - local: model_doc/clap
+# title: CLAP
+# - local: model_doc/encodec
+# title: EnCodec
+# - local: model_doc/hubert
+# title: Hubert
+# - local: model_doc/mctct
+# title: MCTCT
+# - local: model_doc/mms
+# title: MMS
+# - local: model_doc/musicgen
+# title: MusicGen
+# - local: model_doc/musicgen_melody
+# title: MusicGen Melody
+# - local: model_doc/pop2piano
+# title: Pop2Piano
+# - local: model_doc/seamless_m4t
+# title: Seamless-M4T
+# - local: model_doc/seamless_m4t_v2
+# title: SeamlessM4T-v2
+# - local: model_doc/sew
+# title: SEW
+# - local: model_doc/sew-d
+# title: SEW-D
+# - local: model_doc/speech_to_text
+# title: Speech2Text
+# - local: model_doc/speech_to_text_2
+# title: Speech2Text2
+# - local: model_doc/speecht5
+# title: SpeechT5
+# - local: model_doc/unispeech
+# title: UniSpeech
+# - local: model_doc/unispeech-sat
+# title: UniSpeech-SAT
+# - local: model_doc/univnet
+# title: UnivNet
+# - local: model_doc/vits
+# title: VITS
+# - local: model_doc/wav2vec2
+# title: Wav2Vec2
+# - local: model_doc/wav2vec2-bert
+# title: Wav2Vec2-BERT
+# - local: model_doc/wav2vec2-conformer
+# title: Wav2Vec2-Conformer
+# - local: model_doc/wav2vec2_phoneme
+# title: Wav2Vec2Phoneme
+# - local: model_doc/wavlm
+# title: WavLM
+# - local: model_doc/whisper
+# title: Whisper
+# - local: model_doc/xls_r
+# title: XLS-R
+# - local: model_doc/xlsr_wav2vec2
+# title: XLSR-Wav2Vec2
+# title: Audio models
+# - isExpanded: false
+# sections:
+# - local: model_doc/timesformer
+# title: TimeSformer
+# - local: model_doc/videomae
+# title: VideoMAE
+# - local: model_doc/vivit
+# title: ViViT
+# title: Video models
+# - isExpanded: false
+# sections:
+# - local: model_doc/align
+# title: ALIGN
+# - local: model_doc/altclip
+# title: AltCLIP
+# - local: model_doc/blip
+# title: BLIP
+# - local: model_doc/blip-2
+# title: BLIP-2
+# - local: model_doc/bridgetower
+# title: BridgeTower
+# - local: model_doc/bros
+# title: BROS
+# - local: model_doc/chinese_clip
+# title: Chinese-CLIP
+# - local: model_doc/clip
+# title: CLIP
+# - local: model_doc/clipseg
+# title: CLIPSeg
+# - local: model_doc/clvp
+# title: CLVP
+# - local: model_doc/data2vec
+# title: Data2Vec
+# - local: model_doc/deplot
+# title: DePlot
+# - local: model_doc/donut
+# title: Donut
+# - local: model_doc/flava
+# title: FLAVA
+# - local: model_doc/git
+# title: GIT
+# - local: model_doc/grounding-dino
+# title: Grounding DINO
+# - local: model_doc/groupvit
+# title: GroupViT
+# - local: model_doc/idefics
+# title: IDEFICS
+# - local: model_doc/idefics2
+# title: Idefics2
+# - local: model_doc/instructblip
+# title: InstructBLIP
+# - local: model_doc/kosmos-2
+# title: KOSMOS-2
+# - local: model_doc/layoutlm
+# title: LayoutLM
+# - local: model_doc/layoutlmv2
+# title: LayoutLMV2
+# - local: model_doc/layoutlmv3
+# title: LayoutLMV3
+# - local: model_doc/layoutxlm
+# title: LayoutXLM
+# - local: model_doc/lilt
+# title: LiLT
+# - local: model_doc/llava
+# title: Llava
+# - local: model_doc/llava_next
+# title: LLaVA-NeXT
+# - local: model_doc/lxmert
+# title: LXMERT
+# - local: model_doc/matcha
+# title: MatCha
+# - local: model_doc/mgp-str
+# title: MGP-STR
+# - local: model_doc/nougat
+# title: Nougat
+# - local: model_doc/oneformer
+# title: OneFormer
+# - local: model_doc/owlvit
+# title: OWL-ViT
+# - local: model_doc/owlv2
+# title: OWLv2
+# - local: model_doc/paligemma
+# title: PaliGemma
+# - local: model_doc/perceiver
+# title: Perceiver
+# - local: model_doc/pix2struct
+# title: Pix2Struct
+# - local: model_doc/sam
+# title: Segment Anything
+# - local: model_doc/siglip
+# title: SigLIP
+# - local: model_doc/speech-encoder-decoder
+# title: Speech Encoder Decoder Models
+# - local: model_doc/tapas
+# title: TAPAS
+# - local: model_doc/trocr
+# title: TrOCR
+# - local: model_doc/tvlt
+# title: TVLT
+# - local: model_doc/tvp
+# title: TVP
+# - local: model_doc/udop
+# title: UDOP
+# - local: model_doc/video_llava
+# title: VideoLlava
+# - local: model_doc/vilt
+# title: ViLT
+# - local: model_doc/vipllava
+# title: VipLlava
+# - local: model_doc/vision-encoder-decoder
+# title: Vision Encoder Decoder Models
+# - local: model_doc/vision-text-dual-encoder
+# title: Vision Text Dual Encoder
+# - local: model_doc/visual_bert
+# title: VisualBERT
+# - local: model_doc/xclip
+# title: X-CLIP
+# title: Multimodal models
+# - isExpanded: false
+# sections:
+# - local: model_doc/decision_transformer
+# title: محول القرار
+# - local: model_doc/trajectory_transformer
+# title: محول المسار
+# title: نماذج التعلم التعزيزية
+# - isExpanded: false
+# sections:
+# - local: model_doc/autoformer
+# title: Autoformer
+# - local: model_doc/informer
+# title: Informer
+# - local: model_doc/patchtsmixer
+# title: PatchTSMixer
+# - local: model_doc/patchtst
+# title: PatchTST
+# - local: model_doc/time_series_transformer
+# title: محول السلاسل الزمنية
+# title: نماذج السلاسل الزمنية
+# - isExpanded: false
+# sections:
+# - local: model_doc/graphormer
+# title: Graphormer
+# title: نماذج الرسم البياني
+# title: النماذج
+# - sections:
+# - local: internal/modeling_utils
+# title: الطبقات المخصصة والمرافق
+# - local: internal/pipelines_utils
+# title: مرافق خطوط الأنابيب
+# - local: internal/tokenization_utils
+# title: مرافق مقسم النصوص
+# - local: internal/trainer_utils
+# title: مرافق المدرب
+# - local: internal/generation_utils
+# title: مرافق التوليد
+# - local: internal/image_processing_utils
+# title: مرافق معالجة الصور
+# - local: internal/audio_utils
+# title: مرافق معالجة الصوت
+# - local: internal/file_utils
+# title: مرافق عامة
+# - local: internal/time_series_utils
+# title: مرافق السلاسل الزمنية
+# title: مساعدون داخليون
+# title: API
diff --git a/docs/source/ar/accelerate.md b/docs/source/ar/accelerate.md
new file mode 100644
index 00000000000000..486c1efe59af60
--- /dev/null
+++ b/docs/source/ar/accelerate.md
@@ -0,0 +1,120 @@
+# التدريب الموزع باستخدام 🤗 Accelerate
+
+
+مع تزايد حجم النماذج اللغوية، برز التوازي كإحدى الاستراتيجيات لتدريب نماذج أكبر على أجهزة محدودة وتسريع عملية التدريب بمقدار كبير. في Hugging Face، قمنا بإنشاء مكتبة [🤗 Accelerate](https://huggingface.co/docs/accelerate) لمساعدة المستخدمين على تدريب أي نموذج من Transformers بسهولة على أي نوع من الإعدادات الموزعة، سواء كان ذلك على عدة وحدات معالجة رسومات (GPUs) على جهاز واحد أو على عدة وحدات معالجة رسومات موزعة على عدة أجهزة. في هذا الدليل، تعلم كيفية تخصيص حلقة تدريب PyTorch الأصلية لتمكين التدريب في بيئة موزعة.
+
+## الإعداد
+
+ابدأ بتثبيت 🤗 Accelerate:
+
+```bash
+pip install accelerate
+```
+
+ثم قم باستيراد وإنشاء كائن [`~accelerate.Accelerator`]. سيقوم [`~accelerate.Accelerator`] تلقائيًا باكتشاف نوع الإعداد الموزع الخاص بك وتهيئة جميع المكونات اللازمة للتدريب. لن تحتاج إلى وضع نموذجك على جهاز بشكل معين.
+
+```py
+>>> from accelerate import Accelerator
+
+>>> accelerator = Accelerator()
+```
+
+## الاستعداد للتسريع
+
+الخطوة التالية هي تمرير جميع كائنات التدريب ذات الصلة إلى دالة الإعداد [`~accelerate.Accelerator.prepare`]. ويشمل ذلك DataLoaders للتدريب والتقييم، ونموذجًا ومُحسِّن المعاملات (optimizer):
+
+```py
+>>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
+... train_dataloader, eval_dataloader, model, optimizer
+... )
+```
+
+## الخلفي Backward
+
+الإضافة الأخيرة هي استبدال الدالة المعتادة `loss.backward()` في حلقة التدريب الخاصة بك بدالة [`~accelerate.Accelerator.backward`] في 🤗 Accelerate:
+
+```py
+>>> for epoch in range(num_epochs):
+... for batch in train_dataloader:
+... outputs = model(**batch)
+... loss = outputs.loss
+... accelerator.backward(loss)
+
+... optimizer.step()
+... lr_scheduler.step()
+... optimizer.zero_grad()
+... progress_bar.update(1)
+```
+
+كما يمكنك أن ترى في الكود التالي، فأنت بحاجة فقط إلى إضافة أربعة أسطر من الكود إلى حلقة التدريب الخاصة بك لتمكين التدريب الموزع!
+
+```diff
++ from accelerate import Accelerator
+ from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler
+
++ accelerator = Accelerator()
+
+ model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
+ optimizer = AdamW(model.parameters(), lr=3e-5)
+
+- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+- model.to(device)
+
++ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
++ train_dataloader, eval_dataloader, model, optimizer
++ )
+
+ num_epochs = 3
+ num_training_steps = num_epochs * len(train_dataloader)
+ lr_scheduler = get_scheduler(
+ "linear",
+ optimizer=optimizer,
+ num_warmup_steps=0,
+ num_training_steps=num_training_steps
+ )
+
+ progress_bar = tqdm(range(num_training_steps))
+
+ model.train()
+ for epoch in range(num_epochs):
+ for batch in train_dataloader:
+- batch = {k: v.to(device) for k, v in batch.items()}
+ outputs = model(**batch)
+ loss = outputs.loss
+- loss.backward()
++ accelerator.backward(loss)
+optimizer.step()
+ lr_scheduler.step()
+ optimizer.zero_grad()
+ progress_bar.update(1)
+```
+
+## تدريب
+
+بمجرد إضافة أسطر الكود ذات الصلة، قم بتشغيل التدريب الخاص بك في أحد النصوص أو الدفاتر مثل Colaboratory.
+
+### التدريب باستخدام نص برمجي
+
+إذا كنت تشغل التدريب الخاص بك من نص برمجي، فقم بتشغيل الأمر التالي لإنشاء وحفظ ملف تكوين:
+
+```bash
+accelerate config
+```
+
+ثم قم بتشغيل التدريب الخاص بك باستخدام:
+
+```bash
+accelerate launch train.py
+```
+
+### التدريب باستخدام دفتر ملاحظات
+
+يمكن أيضًا تشغيل 🤗 Accelerate في دفاتر إذا كنت تخطط لاستخدام وحدات معالجة الموترات (TPUs) في Colaboratory. قم بتغليف كل الكود المسؤول عن التدريب في دالة، ومررها إلى [`~accelerate.notebook_launcher`]:
+
+```py
+>>> from accelerate import notebook_launcher
+
+>>> notebook_launcher(training_function)
+```
+
+للحصول على مزيد من المعلومات حول 🤗 Accelerate وميزاته الغنية، يرجى الرجوع إلى [الوثائق](https://huggingface.co/docs/accelerate).
\ No newline at end of file
diff --git a/docs/source/ar/agents.md b/docs/source/ar/agents.md
new file mode 100644
index 00000000000000..92b2a4715f6f07
--- /dev/null
+++ b/docs/source/ar/agents.md
@@ -0,0 +1,539 @@
+# الوكلاء والأدوات
+
+[[open-in-colab]]
+
+### ما هو الوكيل؟
+
+يمكن للنظم اللغوية الكبيرة (LLMs) التي تم تدريبها على أداء [نمذجة اللغة السببية](./tasks/language_modeling) التعامل مع مجموعة واسعة من المهام، ولكنها غالبًا ما تواجه صعوبات في المهام الأساسية مثل المنطق والحساب والبحث. وعندما يتم استدعاؤها في مجالات لا تؤدي فيها أداءً جيدًا، فإنها غالبًا ما تفشل في توليد الإجابة التي نتوقعها منها.
+
+يتمثل أحد النهج للتغلب على هذا القصور في إنشاء "وكيل".
+
+الوكيل هو نظام يستخدم LLM كمحرك له، ولديه حق الوصول إلى وظائف تسمى "أدوات".
+
+هذه "الأدوات" هي وظائف لأداء مهمة، وتحتوي على جميع الأوصاف اللازمة للوكيل لاستخدامها بشكل صحيح.
+
+يمكن برمجة الوكيل للقيام بما يلي:
+- وضع سلسلة من الإجراءات/الأدوات وتشغيلها جميعًا في نفس الوقت مثل [`CodeAgent`] على سبيل المثال
+- التخطيط للاجراءات/الأدوات وتنفيذها واحدة تلو الأخرى والانتظار حتى انتهاء كل إجراء قبل إطلاق التالي مثل [`ReactJsonAgent`] على سبيل المثال
+
+### أنواع الوكلاء
+
+#### الوكيل البرمجي (Code agent)
+
+يتبع هذا الوكيل خطوات محددة: أولًا، يخطط لسلسلة من الإجراءات التي يريد تنفيذها، ثم يولّد شفرة Python لتنفيذ جميع الإجراءات في نفس الوقت. وهو يتعامل بشكل أصلي مع أنواع مختلفة من المدخلات والمخرجات للأدوات التي يستخدمها، وبالتالي فهو الخيار الموصى به للمهام متعددة الوسائط.
+
+#### وكلاء التفاعل
+
+هذا هو الوكيل الذي يتم اللجوء إليه لحل مهام الاستدلال، حيث يجعل إطار ReAct ([Yao et al.، 2022](https://huggingface.co/papers/2210.03629)) من الكفاءة حقًا التفكير على أساس ملاحظاته السابقة.
+
+نقوم بتنفيذ إصدارين من ReactJsonAgent:
+- [`ReactJsonAgent`] يقوم بتوليد استدعاءات الأدوات كـ JSON في إخراجها.
+- [`ReactCodeAgent`] هو نوع جديد من ReactJsonAgent يقوم بتوليد استدعاءات أدواته كمقاطع من التعليمات البرمجية، والتي تعمل بشكل جيد حقًا مع LLMs التي تتمتع بأداء قوي في البرمجة.
+
+> [!TIP]
+> اقرأ منشور المدونة [Open-source LLMs as LangChain Agents](https://huggingface.co/blog/open-source-llms-as-agents) لمعرفة المزيد عن وكيل ReAct.
+
+![إطار عمل وكيل ReAct](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/open-source-llms-as-agents/ReAct.png)
+
+على سبيل المثال، إليك كيف يعمل وكيل ReAct Code طريقه من خلال السؤال التالي.
+
+```py3
+>>> agent.run(
+... "How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?",
+... )
+=====New task=====
+How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?
+====Agent is executing the code below:
+bert_blocks = search(query="number of blocks in BERT base encoder")
+print("BERT blocks:", bert_blocks)
+====
+Print outputs:
+BERT blocks: twelve encoder blocks
+
+====Agent is executing the code below:
+attention_layer = search(query="number of layers in Attention is All You Need")
+print("Attention layers:", attention_layer)
+====
+Print outputs:
+Attention layers: Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position- 2 Page 3 Figure 1: The Transformer - model architecture.
+
+====Agent is executing the code below:
+bert_blocks = 12
+attention_layers = 6
+diff = bert_blocks - attention_layers
+print("Difference in blocks:", diff)
+final_answer(diff)
+====
+
+Print outputs:
+Difference in blocks: 6
+
+Final answer: 6
+```
+
+### كيف يمكنني بناء وكيل؟
+
+لتهيئة وكيل، تحتاج إلى هذه الوسائط:
+
+- نموذج لغوي كبير (LLM) يشكل المحرك الأساسي للوكيل. الوكيل نفسه ليس النموذج اللغوي، بل هو برنامج يستخدم النموذج اللغوي كمحرك له.
+- موجه النظام (system prompt): هذه هي التعليمات التي يتم إعطاؤها للنموذج اللغوي لإنشاء مخرجاته.
+- صندوق أدوات (toolbox) يختار الوكيل منه الأدوات لتنفيذها
+- محلل (parser) لاستخراج الأدوات التي يجب استدعاؤها من مخرجات النموذج اللغوي LLM والأدوات التي يجب استخدامها
+
+عند تهيئة نظام الوكيل، يتم استخدام سمات الأداة لإنشاء وصف للأداة، ثم يتم دمجها في موجه النظام الخاص `system_prompt` للوكيل لإعلامه بالأدوات التي يمكنه استخدامها ولماذا.
+
+للبدء، يرجى تثبيت `agents` الإضافية لتثبيت جميع التبعيات الافتراضية.
+
+```bash
+pip install transformers[agents]
+```
+
+قم ببناء محرك LLM الخاص بك من خلال تعريف طريقة `llm_engine` التي تقبل قائمة من [الرسائل](./chat_templating) وتعيد النص. يجب أن تقبل هذه الدالة القابلة للاستدعاء أيضًا معامل `stop` يشير إلى متى يجب التوقف عن التوليد.
+
+```python
+from huggingface_hub import login, InferenceClient
+
+login("")
+
+client = InferenceClient(model="meta-llama/Meta-Llama-3-70B-Instruct")
+
+def llm_engine(messages, stop_sequences=["Task"]) -> str:
+ response = client.chat_completion(messages, stop=stop_sequences, max_tokens=1000)
+ answer = response.choices[0].message.content
+ return answer
+```
+
+يمكنك استخدام أي طريقة `llm_engine` طالما أنها:
+1. تتبع تنسيق [الرسائل](./chat_templating.md) لإدخالها (`List[Dict[str, str]]`) وتعيد `str`
+2. تتوقف عن توليد المخرجات عند التسلسلات التي تم تمريرها في معامل `stop`
+
+أنت بحاجة أيضًا إلى معامل "الأدوات" الذي يقبل قائمة من "الأدوات". يمكنك توفير قائمة فارغة لـ "الأدوات"، ولكن استخدم صندوق الأدوات الافتراضي مع معامل اختياري `add_base_tools=True`.
+
+الآن يمكنك إنشاء وكيل، مثل [`CodeAgent`], وتشغيله. ولتسهيل الأمر، نقدم أيضًا فئة [`HfEngine`] التي تستخدم `huggingface_hub.InferenceClient` بشكل مخفى.
+
+```python
+from transformers import CodeAgent, HfEngine
+
+llm_engine = HfEngine(model="meta-llama/Meta-Llama-3-70B-Instruct")
+agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
+
+agent.run(
+ "Could you translate this sentence from French, say it out loud and return the audio.",
+ sentence="Où est la boulangerie la plus proche?",
+)
+```
+
+هذه الميزة ستكون مفيدة في حالة الحاجة الملحة! يمكنك حتى ترك معامل `llm_engine` غير محدد، وسيتم إنشاء [`HfEngine`] بشكل تلقائي.
+
+```python
+from transformers import CodeAgent
+
+agent = CodeAgent(tools=[], add_base_tools=True)
+
+agent.run(
+ "Could you translate this sentence from French, say it out loud and give me the audio.",
+ sentence="Où est la boulangerie la plus proche?",
+)
+```
+
+لاحظ أننا استخدمنا معامل "sentence" إضافي: يمكنك تمرير النص كمعامل إضافي إلى النموذج.
+
+يمكنك أيضًا استخدام هذا للإشارة إلى مسار الملفات المحلية أو البعيدة للنموذج لاستخدامها:
+
+```py
+from transformers import ReactCodeAgent
+
+agent = ReactCodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
+
+agent.run("Why does Mike not know many people in New York?", audio="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/recording.mp3")
+```
+
+
+تم تحديد موجه النظام ومحلل المخرجات تلقائيًا، ولكن يمكنك فحصهما بسهولة عن طريق استدعاء `system_prompt_template` على وكيلك.
+
+```python
+print(agent.system_prompt_template)
+```
+
+من المهم أن تشرح بأكبر قدر ممكن من الوضوح المهمة التي تريد تنفيذها.
+كل عملية [`~Agent.run`] مستقلة، وبما أن الوكيل مدعوم من LLM، فقد تؤدي الاختلافات الطفيفة في موجهك إلى نتائج مختلفة تمامًا.
+يمكنك أيضًا تشغيل وكيل بشكل متتالي لمهام مختلفة: في كل مرة يتم فيها إعادة تهيئة سمتي `agent.task` و`agent.logs`.
+
+
+#### تنفيذ التعليمات البرمجية
+
+يقوم مفسر Python بتنفيذ التعليمات البرمجية على مجموعة من المدخلات التي يتم تمريرها جنبًا إلى جنب مع أدواتك.
+يجب أن يكون هذا الأمر آمنًا لأن الوظائف الوحيدة التي يمكن استدعاؤها هي الأدوات التي قدمتها (خاصة إذا كانت أدوات من Hugging Face فقط) ووظيفة الطباعة، لذا فأنت مقيد بالفعل بما يمكن تنفيذه.
+
+مفسر Python لا يسمح أيضًا باستدعاء دوال بشكل افتراضي خارج قائمة آمنة، لذا فإن جميع الهجمات الأكثر وضوحًا لا ينبغي أن تكون مشكلة.
+يمكنك أيضًا الإذن باستيرادات إضافية عن طريق تمرير الوحدات النمطية المصرح بها كقائمة من السلاسل في معامل `additional_authorized_imports` عند تهيئة [`ReactCodeAgent`] أو [`CodeAgent`]:
+
+```py
+>>> from transformers import ReactCodeAgent
+
+>>> agent = ReactCodeAgent(tools=[], additional_authorized_imports=['requests', 'bs4'])
+>>> agent.run("Could you get me the title of the page at url 'https://huggingface.co/blog'?")
+
+(...)
+'Hugging Face – Blog'
+```
+
+سيتم إيقاف التنفيذ عند أي رمز يحاول تنفيذ عملية غير قانونية أو إذا كان هناك خطأ Python عادي في التعليمات البرمجية التي تم إنشاؤها بواسطة الوكيل.
+
+> [!WARNING]
+> يمكن لـ LLM توليد شفرة برمجية عشوائية سيتم تنفيذها بعد ذلك: لا تقم باستدعاء أي دوال غير آمنة!
+
+### موجه النظام
+
+يولّد الوكيل، أو بالأحرى LLM الذي يقود الوكيل، مخرجاته بناءً على موجه النظام. يمكن تخصيص موجه النظام وتصميمه للمهام المقصودة. على سبيل المثال، تحقق من موجه النظام لـ [`ReactCodeAgent`] (الإصدار أدناه مبسط قليلاً).
+
+```text
+You will be given a task to solve as best you can.
+You have access to the following tools:
+<<tool_descriptions>>
+
+To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.
+
+At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task, then the tools that you want to use.
+Then in the 'Code:' sequence, you shold write the code in simple Python. The code sequence must end with '/End code' sequence.
+During each intermediate step, you can use 'print()' to save whatever important information you will then need.
+These print outputs will then be available in the 'Observation:' field, for using this information as input for the next step.
+
+In the end you have to return a final answer using the `final_answer` tool.
+
+Here are a few examples using notional tools:
+---
+{examples}
+
+Above example were using notional tools that might not exist for you. You only have acces to those tools:
+<<tool_names>>
+You also can perform computations in the python code you generate.
+
+Always provide a 'Thought:' and a 'Code:\n```py' sequence ending with '```' sequence. You MUST provide at least the 'Code:' sequence to move forward.
+
+Remember to not perform too many operations in a single code block! You should split the task into intermediate code blocks.
+Print results at the end of each step to save the intermediate results. Then use final_answer() to return the final result.
+
+Remember to make sure that variables you use are all defined.
+
+Now Begin!
+```
+
+يتضمن موجه النظام:
+- *مقدمة* تشرح كيف يجب أن يتصرف الوكيل والأدوات التي يجب عليه استخدامها.
+- وصف لجميع الأدوات التي يتم تحديدها بواسطة رمز `<<tool_descriptions>>` الذي يتم استبداله ديناميكيًا في وقت التشغيل بالأدوات التي يحددها المستخدم أو يختارها.
+ - يأتي وصف الأداة من سمات الأداة، `name`، و`description`، و`inputs` و`output_type`، وقالب `jinja2` بسيط يمكنك تحسينه.
+- شكل المخرج المتوقع.
+
+يمكنك تحسين موجه النظام، على سبيل المثال، عن طريق إضافة شرح لتنسيق المخرجات.
+
+للحصول على أقصى قدر من المرونة، يمكنك الكتابة فوق قالب موجه النظام بالكامل عن طريق تمرير موجه مخصص كمعامل إلى معلمة `system_prompt`.
+
+```python
+from transformers import ReactJsonAgent
+from transformers.agents import PythonInterpreterTool
+
+agent = ReactJsonAgent(tools=[PythonInterpreterTool()], system_prompt="{your_custom_prompt}")
+```
+
+> [!WARNING]
+> يرجى التأكد من تحديد سلسلة `<<tool_descriptions>>` في مكان ما في `template` حتى يكون الوكيل على علم
+بالأدوات المتاحة.
+
+
+### فحص تشغيل الوكيل
+
+فيما يلي بعض السمات المفيدة لفحص ما حدث بعد التشغيل:
+- تخزن `agent.logs` سجلات مفصلة للوكيل. في كل خطوة من تشغيل الوكيل، يتم تخزين كل شيء في قاموس إلحاقه بـ `agent.logs`.
+- تشغيل `agent.write_inner_memory_from_logs()` يخلق ذاكرة داخلية لسجلات الوكيل للنظام LLM لعرضها، كقائمة من رسائل الدردشة. تنتقل هذه الطريقة عبر كل خطوة من سجل الوكيل ولا تخزن سوى ما يهمها كرسالة: على سبيل المثال، سيحفظ موجه النظام والمهمة في رسائل منفصلة، ثم لكل خطوة سيخزن مخرج LLM كرسالة، ومخرج استدعاء الأداة كرسالة أخرى. استخدم هذا إذا كنت تريد عرضًا عامًا لما حدث - ولكن لن يتم نسخ كل سجل بواسطة هذه الطريقة.
+
+## الأدوات
+
+الأداة هي عبارة عن وظيفة أساسية يستخدمها الوكيل لتنفيذ مهمة محددة.
+
+يمكنك على سبيل المثال التحقق من [`PythonInterpreterTool`]: لديه اسم ووصف ووصف للمدخلات ونوع للمخرج، وطريقة `__call__` التي تقوم بتنفيذ المهمة المطلوبة.
+
+عند تهيئة الوكيل، يتم استخدام سمات الأداة لتوليد وصف للأداة يتم تضمينه في موجه النظام الخاص بالوكيل. يتيح هذا للوكيل معرفة الأدوات التي يمكنه استخدامها ولماذا.
+
+### صندوق الأدوات الافتراضي
+
+يأتي Transformers مع صندوق أدوات افتراضي لتمكين الوكلاء، والذي يمكنك إضافته إلى وكيلك عند التهيئة باستخدام معامل `add_base_tools = True`:
+
+- **الإجابة على أسئلة المستند**: الإجابة على سؤال حول المستند (مثل ملف PDF) بتنسيق صورة ([Donut](./model_doc/donut))
+- **الإجابة على أسئلة الصور**: الإجابة على سؤال حول صورة ([VILT](./model_doc/vilt))
+- **الكلام إلى نص**: قم بتفريغ الكلام إلى نص ([Whisper](./model_doc/whisper))
+- **النص إلى كلام**: تحويل النص إلى كلام ([SpeechT5](./model_doc/speecht5))
+- **الترجمة**: ترجمة جملة معينة من لغة المصدر إلى لغة الهدف.
+- **مفسر كود Python**: تشغيل كود Python الذي تم إنشاؤه بواسطة LLM في بيئة آمنة. لن يتم إضافة هذه الأداة إلى [`ReactJsonAgent`] إلا إذا استخدمت `add_base_tools=True`، نظرًا لأن الوكلاء المستندين إلى التعليمات البرمجية يمكنهم بالفعل تنفيذ كود Python.
+
+
+يمكنك استخدام أداة يدويًا عن طريق استدعاء دالة [`load_tool`] وتحديد مهمة لتنفيذها.
+
+```python
+from transformers import load_tool
+
+tool = load_tool("text-to-speech")
+audio = tool("This is a text to speech tool")
+```
+
+### إنشاء أداة جديدة
+
+يمكنك إنشاء أداتك الخاصة لتغطية حالات الاستخدام التي لا تغطيها الأدوات الافتراضية من Hugging Face.
+على سبيل المثال، دعنا نقوم بإنشاء أداة تعرض النموذج الأكثر تنزيلًا لمهمة معينة من Hub.
+
+سوف نبدأ بالكود التالي.
+
+```python
+from huggingface_hub import list_models
+
+task = "text-classification"
+
+model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
+print(model.id)
+```
+
+يمكن تحويل هذه الشيفرة إلى فئة ترث من الفئة العليا [`Tool`].
+
+تحتاج الأداة المخصصة إلى:
+
+- اسم `name`، والتي تمثل اسم الأداة نفسها. عادةً ما يصف الاسم وظيفتها. بما أن الكود يعيد النموذج الأكثر تنزيلًا لمهمة ما، فلنسمها `model_download_counter`.
+- تستخدم خاصية `description` لملء موجه نظام الوكيل.
+- خاصية `inputs`، والتي هي عبارة عن قاموس بمفاتيح "type" و"description". يحتوي على معلومات تساعد المفسر Python على اتخاذ خيارات مستنيرة بشأن المدخلات.
+- خاصية `output_type`، والتي تحدد نوع المخرج.
+- طريقة `forward` والتي تحتوي على الكود الذي سيتم تنفيذه للحصول على النتيجة النهائية.
+
+```python
+from transformers import Tool
+from huggingface_hub import list_models
+
+class HFModelDownloadsTool(Tool):
+ name = "model_download_counter"
+ description = (
+ "This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub. "
+ "It returns the name of the checkpoint."
+ )
+
+ inputs = {
+ "task": {
+ "type": "text",
+ "description": "the task category (such as text-classification, depth-estimation, etc)",
+ }
+ }
+ output_type = "text"
+
+ def forward(self, task: str):
+ model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
+ return model.id
+```
+
+الآن بعد أن أصبحت فئة `HFModelDownloadsTool` المخصصة جاهزة، يمكنك حفظها في ملف باسم `model_downloads.py` واستيرادها للاستخدام.
+
+```python
+from model_downloads import HFModelDownloadsTool
+
+tool = HFModelDownloadsTool()
+```
+
+يمكنك أيضًا مشاركة أداتك المخصصة في Hub عن طريق استدعاء [`~Tool.push_to_hub`] على الأداة. تأكد من أنك قمت بإنشاء مستودع لها على Hub وأنك تستخدم رمز وصول للقراءة.
+
+```python
+tool.push_to_hub("{your_username}/hf-model-downloads")
+```
+
+قم بتحميل الأداة باستخدام دالة [`~Tool.load_tool`] ومررها إلى معلمة `tools` في الوكيل الخاص بك.
+
+```python
+from transformers import load_tool, CodeAgent
+
+model_download_tool = load_tool("m-ric/hf-model-downloads")
+agent = CodeAgent(tools=[model_download_tool], llm_engine=llm_engine)
+agent.run(
+ "Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?"
+)
+```
+
+ستحصل على ما يلي:
+
+```text
+======== New task ========
+Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?
+==== Agent is executing the code below:
+most_downloaded_model = model_download_counter(task="text-to-video")
+print(f"The most downloaded model for the 'text-to-video' task is {most_downloaded_model}.")
+====
+```
+
+والناتج:
+
+`"النموذج الأكثر تنزيلًا لمهمة 'text-to-video' هو ByteDance/AnimateDiff-Lightning."`
+
+### إدارة صندوق أدوات الوكيل الخاص بك
+
+إذا كنت قد قمت بتهيئة وكيل، فمن غير الملائم إعادة تهيئته من البداية لإضافة أداة جديدة ترغب في استخدامها. باستخدام مكتبة Transformers، يمكنك إدارة صندوق أدوات الوكيل بإضافة أو استبدال أداة موجودة.
+
+دعنا نضيف الأداة `model_download_tool` إلى وكيل تم تهيئته مسبقًا باستخدام صندوق الأدوات الافتراضي.
+
+```python
+from transformers import CodeAgent
+
+agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
+agent.toolbox.add_tool(model_download_tool)
+```
+
+الآن يمكننا الاستفادة من الأداة الجديدة وأداة تحويل النص إلى كلام السابقة:
+
+```python
+ agent.run(
+ "Can you read out loud the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub and return the audio?"
+ )
+```
+
+| **Audio** |
+|------------------------------------------------------------------------------------------------------------------------------------------------------|
+|
diff --git a/i18n/README_ar.md b/i18n/README_ar.md
index 60ec4e1c068907..c2dd588fdb233f 100644
--- a/i18n/README_ar.md
+++ b/i18n/README_ar.md
@@ -49,6 +49,7 @@ limitations under the License.
Deutsch |
Tiếng Việt |
العربية |
+ اردو |
diff --git a/i18n/README_de.md b/i18n/README_de.md
index 7128a9ad999fc7..2532c9e12fab59 100644
--- a/i18n/README_de.md
+++ b/i18n/README_de.md
@@ -49,6 +49,7 @@ limitations under the License.
Deutsch |
Tiếng Việt |
العربية |
+ اردو |
diff --git a/i18n/README_es.md b/i18n/README_es.md
index fdef68ff1b4c20..6682147d7867cf 100644
--- a/i18n/README_es.md
+++ b/i18n/README_es.md
@@ -44,6 +44,7 @@ limitations under the License.
Deutsch |
Tiếng Việt |
العربية |
+ اردو |
diff --git a/i18n/README_fr.md b/i18n/README_fr.md
index 2c78ba041db2f1..c1eaa10edb927d 100644
--- a/i18n/README_fr.md
+++ b/i18n/README_fr.md
@@ -49,6 +49,7 @@ limitations under the License.
Deutsch |
Tiếng Việt |
العربية |
+ اردو |
diff --git a/i18n/README_hd.md b/i18n/README_hd.md
index a6e017a6f833c1..07077e5dd9c37d 100644
--- a/i18n/README_hd.md
+++ b/i18n/README_hd.md
@@ -69,6 +69,7 @@ checkpoint: जाँच बिंदु
Deutsch |
Tiếng Việt |
العربية |
+ اردو |
diff --git a/i18n/README_ja.md b/i18n/README_ja.md
index 27b770869f7192..293a5ee111b0c7 100644
--- a/i18n/README_ja.md
+++ b/i18n/README_ja.md
@@ -79,6 +79,7 @@ user: ユーザ
Deutsch |
Tiếng Việt |
العربية |
+ اردو |
diff --git a/i18n/README_ko.md b/i18n/README_ko.md
index 283318478f4b1e..e2a9b80d0d3ecc 100644
--- a/i18n/README_ko.md
+++ b/i18n/README_ko.md
@@ -44,6 +44,7 @@ limitations under the License.
Deutsch |
Tiếng Việt |
العربية |
+ اردو |
diff --git a/i18n/README_pt-br.md b/i18n/README_pt-br.md
index a356caefba9b42..79007e5aaa33f9 100644
--- a/i18n/README_pt-br.md
+++ b/i18n/README_pt-br.md
@@ -49,6 +49,7 @@ limitations under the License.
Deutsch |
Tiếng Việt |
العربية |
+ اردو |
diff --git a/i18n/README_ru.md b/i18n/README_ru.md
index fe548c1001149a..759acdbb912771 100644
--- a/i18n/README_ru.md
+++ b/i18n/README_ru.md
@@ -49,6 +49,7 @@ limitations under the License.
Deutsch |
Tiếng Việt |
العربية |
+ اردو |
diff --git a/i18n/README_te.md b/i18n/README_te.md
index 9dbd522c463db4..feb537ad1a48d2 100644
--- a/i18n/README_te.md
+++ b/i18n/README_te.md
@@ -51,6 +51,7 @@ limitations under the License.
Deutsch |
Tiếng Việt |
العربية |
+ اردو |
جدید ترین مشین لرننگ برائے JAX، PyTorch اور TensorFlow
+
+
+
+
+
+
+🤗 Transformers مختلف طریقوں جیسے کہ متن، بصارت، اور آڈیو پر کام کرنے کے لیے ہزاروں پری ٹرینڈ ماڈلز فراہم کرتے ہیں۔
+
+یہ ماڈلز درج ذیل پر لاگو کیے جا سکتے ہیں:
+
+* 📝 متن، جیسے کہ متن کی درجہ بندی، معلومات کا استخراج، سوالات کے جوابات، خلاصہ، ترجمہ، اور متن کی تخلیق، 100 سے زائد زبانوں میں۔
+* 🖼️ تصاویر، جیسے کہ تصویر کی درجہ بندی، اشیاء کی شناخت، اور تقسیم۔
+* 🗣️ آڈیو، جیسے کہ تقریر کی شناخت اور آڈیو کی درجہ بندی۔
+
+ٹرانسفارمر ماڈلز **مختلف طریقوں کو ملا کر** بھی کام انجام دے سکتے ہیں، جیسے کہ ٹیبل سوال جواب، بصری حروف کی شناخت، اسکین شدہ دستاویزات سے معلومات نکالنا، ویڈیو کی درجہ بندی، اور بصری سوال جواب۔
+
+🤗 Transformers ایسے APIs فراہم کرتا ہے جو آپ کو تیز رفتاری سے پری ٹرینڈ ماڈلز کو ایک دیے گئے متن پر ڈاؤن لوڈ اور استعمال کرنے، انہیں اپنے ڈیٹا سیٹس پر فائن ٹون کرنے، اور پھر ہمارے [ماڈل حب](https://huggingface.co/models) پر کمیونٹی کے ساتھ شیئر کرنے کی سہولت دیتا ہے۔ اسی وقت، ہر پائتھن ماڈیول جو ایک آرکیٹیکچر کو بیان کرتا ہے، مکمل طور پر خود مختار ہوتا ہے اور اسے تیز تحقیقاتی تجربات کے لیے تبدیل کیا جا سکتا ہے۔
+
+
+🤗 Transformers تین سب سے مشہور ڈیپ لرننگ لائبریریوں — [Jax](https://jax.readthedocs.io/en/latest/)، [PyTorch](https://pytorch.org/) اور [TensorFlow](https://www.tensorflow.org/) — کی مدد سے تیار کردہ ہے، جن کے درمیان بے حد ہموار انضمام ہے۔ اپنے ماڈلز کو ایک کے ساتھ تربیت دینا اور پھر دوسرے کے ساتھ inference کے لیے لوڈ کرنا انتہائی سادہ ہے۔
+
+## آن لائن ڈیمو
+
+آپ ہمارے زیادہ تر ماڈلز کو براہ راست ان کے صفحات پر [ماڈل ہب](https://huggingface.co/models) سے آزما سکتے ہیں۔ ہم عوامی اور نجی ماڈلز کے لیے [ذاتی ماڈل ہوسٹنگ، ورژننگ، اور انفرنس API](https://huggingface.co/pricing) بھی فراہم کرتے ہیں۔
+
+یہاں چند مثالیں ہیں:
+
+قدرتی زبان کی پروسیسنگ میں:
+
+- [BERT کے ساتھ ماسک شدہ الفاظ کی تکمیل](https://huggingface.co/google-bert/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
+- [Electra کے ساتھ نامزد اداروں کی شناخت](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
+- [Mistral کے ساتھ متنی جنریشن](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)
+- [RoBERTa کے ساتھ قدرتی زبان کی دلیل](https://huggingface.co/FacebookAI/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
+- [BART کے ساتھ خلاصہ کاری](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
+- [DistilBERT کے ساتھ سوالات کے جوابات](https://huggingface.co/distilbert/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
+- [T5 کے ساتھ ترجمہ](https://huggingface.co/google-t5/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)
+
+کمپیوٹر وژن میں:
+- [ViT کے ساتھ امیج کی درجہ بندی](https://huggingface.co/google/vit-base-patch16-224)
+- [DETR کے ساتھ اشیاء کی شناخت](https://huggingface.co/facebook/detr-resnet-50)
+- [SegFormer کے ساتھ سیمانٹک سیگمینٹیشن](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512)
+- [Mask2Former کے ساتھ پینوسٹک سیگمینٹیشن](https://huggingface.co/facebook/mask2former-swin-large-coco-panoptic)
+- [Depth Anything کے ساتھ گہرائی کا اندازہ](https://huggingface.co/docs/transformers/main/model_doc/depth_anything)
+- [VideoMAE کے ساتھ ویڈیو کی درجہ بندی](https://huggingface.co/docs/transformers/model_doc/videomae)
+- [OneFormer کے ساتھ یونیورسل سیگمینٹیشن](https://huggingface.co/shi-labs/oneformer_ade20k_dinat_large)
+
+
+آڈیو:
+- [خودکار تقریر کی پہچان Whisper کے ساتھ](https://huggingface.co/openai/whisper-large-v3)
+- [کلیدی الفاظ کی تلاش Wav2Vec2 کے ساتھ](https://huggingface.co/superb/wav2vec2-base-superb-ks)
+- [آڈیو کی درجہ بندی Audio Spectrogram Transformer کے ساتھ](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593)
+
+ملٹی ماڈل ٹاسک میں:
+
+- [ٹیبل سوال جواب کے لیے TAPAS](https://huggingface.co/google/tapas-base-finetuned-wtq)
+- [ویژول سوال جواب کے لیے ViLT](https://huggingface.co/dandelin/vilt-b32-finetuned-vqa)
+- [امیج کیپشننگ کے لیے LLaVa](https://huggingface.co/llava-hf/llava-1.5-7b-hf)
+- [زیرو شاٹ امیج کلاسیفیکیشن کے لیے SigLIP](https://huggingface.co/google/siglip-so400m-patch14-384)
+- [دستاویزی سوال جواب کے لیے LayoutLM](https://huggingface.co/impira/layoutlm-document-qa)
+- [زیرو شاٹ ویڈیو کلاسیفیکیشن کے لیے X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)
+- [زیرو شاٹ آبجیکٹ ڈیٹیکشن کے لیے OWLv2](https://huggingface.co/docs/transformers/en/model_doc/owlv2)
+- [زیرو شاٹ امیج سیگمنٹیشن کے لیے CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)
+- [خودکار ماسک جنریشن کے لیے SAM](https://huggingface.co/docs/transformers/model_doc/sam)
+
+
+## ٹرانسفارمرز کے 100 منصوبے
+
+🤗 Transformers صرف پیشگی تربیت یافتہ ماڈلز کا ایک ٹول کٹ نہیں ہے: یہ ایک کمیونٹی ہے جو اس کے ارد گرد اور ہیگنگ فیس حب پر تعمیر شدہ منصوبوں کا مجموعہ ہے۔ ہم چاہتے ہیں کہ🤗 Transformers ترقی کاروں، محققین، طلباء، پروفیسرز، انجینئرز، اور ہر کسی کو اپنے خوابوں کے منصوبے بنانے میں مدد فراہم کرے۔
+
+
+🤗 Transformers کے 100,000 ستاروں کی خوشی منانے کے لیے، ہم نے کمیونٹی پر روشنی ڈالنے کا فیصلہ کیا ہے، اور ہم نے [awesome-transformers](./awesome-transformers.md) کا صفحہ بنایا ہے جو 100 شاندار منصوبے درج کرتا ہے جو 🤗 Transformers کے ارد گرد بنائے گئے ہیں۔
+
+اگر آپ کے پاس کوئی ایسا منصوبہ ہے جسے آپ سمجھتے ہیں کہ اس فہرست کا حصہ ہونا چاہیے، تو براہ کرم ایک PR کھولیں تاکہ اسے شامل کیا جا سکے!
+
+## اگر آپ ہیگنگ فیس ٹیم سے حسب ضرورت معاونت تلاش کر رہے ہیں
+
+
+
+
+
+## فوری ٹور
+
+دیے گئے ان پٹ (متن، تصویر، آڈیو، ...) پر ماڈل کو فوری طور پر استعمال کرنے کے لیے، ہم pipeline API فراہم کرتے ہیں۔ پائپ لائنز ایک پیشگی تربیت یافتہ ماڈل کو اس ماڈل کی تربیت کے دوران استعمال ہونے والے پری پروسیسنگ کے ساتھ گروپ کرتی ہیں۔ یہاں یہ ہے کہ مثبت اور منفی متون کی درجہ بندی کے لیے پائپ لائن کو جلدی سے کیسے استعمال کیا جائے:
+
+
+```python
+>>> from transformers import pipeline
+
+# جذبات کے تجزیے کے لیے ایک پائپ لائن مختص کریں
+>>> classifier = pipeline('sentiment-analysis')
+>>> classifier('We are very happy to introduce pipeline to the transformers repository.')
+[{'label': 'POSITIVE', 'score': 0.9996980428695679}]
+```
+
+دوسری لائن کوڈ پائپ لائن کے ذریعہ استعمال ہونے والے پیشگی تربیت یافتہ ماڈل کو ڈاؤن لوڈ اور کیش کرتی ہے، جبکہ تیسری لائن اسے دیے گئے متن پر جانچتی ہے۔ یہاں، جواب "مثبت" ہے جس کی اعتماد کی شرح 99.97% ہے۔
+
+بہت سے کاموں کے لیے ایک پیشگی تربیت یافتہ pipeline تیار ہے، NLP کے علاوہ کمپیوٹر ویژن اور آواز میں بھی۔ مثال کے طور پر، ہم تصویر میں دریافت شدہ اشیاء کو آسانی سے نکال سکتے ہیں:
+
+
+``` python
+>>> import requests
+>>> from PIL import Image
+>>> from transformers import pipeline
+
+# پیاری بلیوں والی ایک تصویر ڈاؤن لوڈ کریں
+>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png"
+>>> image_data = requests.get(url, stream=True).raw
+>>> image = Image.open(image_data)
+
+>>> object_detector = pipeline('object-detection')
+>>> object_detector(image)
+[{'score': 0.9982201457023621,
+ 'label': 'remote',
+ 'box': {'xmin': 40, 'ymin': 70, 'xmax': 175, 'ymax': 117}},
+ {'score': 0.9960021376609802,
+ 'label': 'remote',
+ 'box': {'xmin': 333, 'ymin': 72, 'xmax': 368, 'ymax': 187}},
+ {'score': 0.9954745173454285,
+ 'label': 'couch',
+ 'box': {'xmin': 0, 'ymin': 1, 'xmax': 639, 'ymax': 473}},
+ {'score': 0.9988006353378296,
+ 'label': 'cat',
+ 'box': {'xmin': 13, 'ymin': 52, 'xmax': 314, 'ymax': 470}},
+ {'score': 0.9986783862113953,
+ 'label': 'cat',
+ 'box': {'xmin': 345, 'ymin': 23, 'xmax': 640, 'ymax': 368}}]
+```
+
+یہاں، ہم کو تصویر میں دریافت شدہ اشیاء کی فہرست ملتی ہے، ہر ایک کے گرد ایک باکس اور اعتماد کا اسکور۔ یہاں اصل تصویر بائیں طرف ہے، اور پیشگوئیاں دائیں طرف ظاہر کی گئی ہیں:
+
+
+
+
+
+
+
+آپ `pipeline` API کی مدد سے معاونت شدہ کاموں کے بارے میں مزید جان سکتے ہیں [اس ٹیوٹوریل](https://huggingface.co/docs/transformers/task_summary) میں۔
+
+
+`pipeline` کے علاوہ، کسی بھی پیشگی تربیت یافتہ ماڈل کو آپ کے دیے گئے کام پر ڈاؤن لوڈ اور استعمال کرنے کے لیے، صرف تین لائنوں کا کوڈ کافی ہے۔ یہاں PyTorch ورژن ہے:
+
+```python
+>>> from transformers import AutoTokenizer, AutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+>>> model = AutoModel.from_pretrained("google-bert/bert-base-uncased")
+
+>>> inputs = tokenizer("Hello world!", return_tensors="pt")
+>>> outputs = model(**inputs)
+```
+
+اور یہاں TensorFlow کے لیے مساوی کوڈ ہے:
+```python
+>>> from transformers import AutoTokenizer, TFAutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+>>> model = TFAutoModel.from_pretrained("google-bert/bert-base-uncased")
+
+>>> inputs = tokenizer("Hello world!", return_tensors="tf")
+>>> outputs = model(**inputs)
+```
+
+ٹوکینائزر تمام پری پروسیسنگ کا ذمہ دار ہے جس کی پیشگی تربیت یافتہ ماڈل کو ضرورت ہوتی ہے اور اسے براہ راست ایک واحد سٹرنگ (جیسا کہ اوپر کی مثالوں میں) یا ایک فہرست پر کال کیا جا سکتا ہے۔ یہ ایک لغت فراہم کرے گا جسے آپ ڈاؤن اسٹریم کوڈ میں استعمال کر سکتے ہیں یا سادہ طور پر اپنے ماڈل کو ** دلیل انپیکنگ آپریٹر کے ذریعے براہ راست پاس کر سکتے ہیں۔
+
+ماڈل خود ایک باقاعدہ [PyTorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) یا [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) (آپ کے بیک اینڈ پر منحصر ہے) ہے جسے آپ معمول کے مطابق استعمال کر سکتے ہیں۔ [یہ ٹیوٹوریل](https://huggingface.co/docs/transformers/training) وضاحت کرتا ہے کہ کلاسیکی PyTorch یا TensorFlow تربیتی لوپ میں ایسے ماڈل کو کیسے ضم کیا جائے، یا ہمارے `Trainer` API کا استعمال کرتے ہوئے نئے ڈیٹا سیٹ پر جلدی سے فائن ٹیون کیسے کیا جائے۔
+
+## مجھے Transformers کیوں استعمال کرنا چاہیے؟
+
+ 1. استعمال میں آسان جدید ترین ماڈلز:
+
+ - قدرتی زبان کی سمجھ اور تخلیق، کمپیوٹر وژن، اور آڈیو کے کاموں میں اعلی کارکردگی۔
+ - معلمین اور عملی ماہرین کے لیے کم داخلی رکاوٹ۔
+ - سیکھنے کے لیے صرف تین کلاسز کے ساتھ چند یوزر فرینڈلی ایبسٹریکشنز۔
+ - ہمارے تمام pretrained ماڈلز کے استعمال کے لیے ایک متحد API۔
+
+ 2. کمپیوٹیشن کے اخراجات میں کمی، کاربن فٹ پرنٹ میں کمی:
+
+ - محققین ہمیشہ دوبارہ تربیت کرنے کی بجائے تربیت شدہ ماڈلز شیئر کر سکتے ہیں۔
+ - عملی ماہرین کمپیوٹ وقت اور پروڈکشن اخراجات کو کم کر سکتے ہیں۔
+ - ہر موڈیلٹی کے لیے 400,000 سے زیادہ pretrained ماڈلز کے ساتھ درجنوں آرکیٹیکچرز۔
+
+ 3. ماڈل کے لائف ٹائم کے ہر حصے کے لیے صحیح
+فریم ورک کا انتخاب کریں:
+
+ - 3 لائنز کے کوڈ میں جدید ترین ماڈلز تربیت دیں۔
+ - ایک ماڈل کو کسی بھی وقت TF2.0/PyTorch/JAX فریم ورکس کے درمیان منتقل کریں۔
+ - تربیت، تشخیص، اور پروڈکشن کے لیے بغیر کسی رکاوٹ کے صحیح فریم ورک کا انتخاب کریں۔
+
+ 4. اپنے ضروریات کے مطابق آسانی سے ماڈل یا ایک مثال کو حسب ضرورت بنائیں:
+
+ - ہم ہر آرکیٹیکچر کے لیے مثالیں فراہم کرتے ہیں تاکہ اصل مصنفین کے شائع شدہ نتائج کو دوبارہ پیدا کیا جا سکے۔
+ - ماڈلز کی اندرونی تفصیلات کو جتنا ممکن ہو یکساں طور پر ظاہر کیا جاتا ہے۔
+ - فوری تجربات کے لیے ماڈل فائلز کو لائبریری سے آزادانہ طور پر استعمال کیا جا سکتا ہے۔
+
+## مجھے Transformers کیوں استعمال نہیں کرنا چاہیے؟
+
+- یہ لائبریری نیورل نیٹس کے لیے بلڈنگ بلاکس کا ماڈیولر ٹول باکس نہیں ہے۔ ماڈل فائلز میں موجود کوڈ جان بوجھ کر اضافی ایبسٹریکشنز کے ساتھ دوبارہ ترتیب نہیں دیا گیا ہے، تاکہ محققین بغیر اضافی ایبسٹریکشنز/فائلوں میں گئے ہوئے جلدی سے ہر ماڈل پر کام کر سکیں۔
+- تربیتی API کا مقصد کسی بھی ماڈل پر کام کرنے کے لیے نہیں ہے بلکہ یہ لائبریری کے فراہم کردہ ماڈلز کے ساتھ کام کرنے کے لیے بہتر بنایا گیا ہے۔ عام مشین لرننگ لوپس کے لیے، آپ کو دوسری لائبریری (ممکنہ طور پر [Accelerate](https://huggingface.co/docs/accelerate)) استعمال کرنی چاہیے۔
+- حالانکہ ہم جتنا ممکن ہو زیادہ سے زیادہ استعمال کے کیسز پیش کرنے کی کوشش کرتے ہیں، ہمارے [مثالوں کے فولڈر](https://github.com/huggingface/transformers/tree/main/examples) میں موجود اسکرپٹس صرف یہی ہیں: مثالیں۔ یہ توقع کی جاتی ہے کہ یہ آپ کے مخصوص مسئلے پر فوراً کام نہیں کریں گی اور آپ کو اپنی ضروریات کے مطابق کوڈ کی کچھ لائنیں تبدیل کرنی پڑیں گی۔
+
+### انسٹالیشن
+
+#### pip کے ساتھ
+
+یہ ریپوزٹری Python 3.8+، Flax 0.4.1+، PyTorch 1.11+، اور TensorFlow 2.6+ پر ٹیسٹ کی گئی ہے۔
+
+آپ کو 🤗 Transformers کو ایک [ورچوئل ماحول](https://docs.python.org/3/library/venv.html) میں انسٹال کرنا چاہیے۔ اگر آپ Python ورچوئل ماحول سے واقف نہیں ہیں، تو [یوزر گائیڈ](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) دیکھیں۔
+
+پہلے، Python کے اس ورژن کے ساتھ ایک ورچوئل ماحول بنائیں جو آپ استعمال کر رہے ہیں اور اسے ایکٹیویٹ کریں۔
+
+پھر، آپ کو کم از کم Flax، PyTorch، یا TensorFlow میں سے کسی ایک کو انسٹال کرنے کی ضرورت ہوگی۔
+براہ کرم اپنے پلیٹ فارم کے لیے مخصوص انسٹالیشن کمانڈ کے حوالے سے [TensorFlow انسٹالیشن صفحہ](https://www.tensorflow.org/install/)، [PyTorch انسٹالیشن صفحہ](https://pytorch.org/get-started/locally/#start-locally) اور/یا [Flax](https://github.com/google/flax#quick-install) اور [Jax](https://github.com/google/jax#installation) انسٹالیشن صفحات دیکھیں۔
+
+جب ان میں سے کوئی ایک بیک اینڈ انسٹال ہو جائے، تو 🤗 Transformers کو pip کے ذریعے مندرجہ ذیل طریقے سے انسٹال کیا جا سکتا ہے:
+
+```bash
+pip install transformers
+```
+
+اگر آپ مثالوں کے ساتھ کھیلنا چاہتے ہیں یا آپ کو کوڈ کا تازہ ترین ورژن چاہیے اور آپ نئے ریلیز کا انتظار نہیں کر سکتے، تو آپ کو [سورس سے لائبریری انسٹال کرنی ہوگی](https://huggingface.co/docs/transformers/installation#installing-from-source)۔
+
+#### conda کے ساتھ
+
+🤗 Transformers کو conda کے ذریعے مندرجہ ذیل طریقے سے انسٹال کیا جا سکتا ہے:
+
+```shell script
+conda install conda-forge::transformers
+```
+
+> **_نوٹ:_** `transformers` کو `huggingface` چینل سے انسٹال کرنا اب ختم کیا جا چکا ہے۔
+
+Flax، PyTorch، یا TensorFlow کو conda کے ساتھ انسٹال کرنے کے لیے انسٹالیشن صفحات کی پیروی کریں۔
+
+> **_نوٹ:_** ونڈوز پر، آپ کو کیشنگ سے فائدہ اٹھانے کے لیے ڈویلپر موڈ کو ایکٹیویٹ کرنے کا پیغام دیا جا سکتا ہے۔ اگر یہ آپ کے لیے ممکن نہیں ہے، تو براہ کرم ہمیں [اس مسئلے](https://github.com/huggingface/huggingface_hub/issues/1062) میں بتائیں۔
+
+### ماڈل کی تعمیرات
+
+ 🤗 Transformers کی طرف سے فراہم کردہ **[تمام ماڈل چیک پوائنٹس](https://huggingface.co/models)** ہگنگ فیس کے ماڈل حب [model hub](https://huggingface.co/models) سے بآسانی مربوط ہیں، جہاں یہ براہ راست [صارفین](https://huggingface.co/users) اور [تنظیموں](https://huggingface.co/organizations) کے ذریعہ اپ لوڈ کیے جاتے ہیں۔
+
+چیک پوائنٹس کی موجودہ تعداد: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
+
+🤗 Transformers فی الحال درج ذیل معماریاں فراہم کرتا ہے: ہر ایک کا اعلی سطحی خلاصہ دیکھنے کے لیے [یہاں](https://huggingface.co/docs/transformers/model_summary) دیکھیں۔
+
+یہ چیک کرنے کے لیے کہ ہر ماڈل کی Flax، PyTorch یا TensorFlow میں کوئی عملداری ہے یا 🤗 Tokenizers لائبریری کے ذریعہ سپورٹ کردہ ٹوکنائزر کے ساتھ ہے، [اس جدول](https://huggingface.co/docs/transformers/index#supported-frameworks) کا حوالہ لیں۔
+
+یہ عملداری مختلف ڈیٹا سیٹس پر ٹیسٹ کی گئی ہیں (مثال کے اسکرپٹس دیکھیں) اور اصل عملداری کی کارکردگی کے ہم آہنگ ہونی چاہئیں۔ آپ کو کارکردگی کی مزید تفصیلات [دستاویزات](https://github.com/huggingface/transformers/tree/main/examples) کے مثالوں کے سیکشن میں مل سکتی ہیں۔
+
+
+## مزید معلومات حاصل کریں
+
+| سیکشن | تفصیل |
+|-|-|
+| [دستاویزات](https://huggingface.co/docs/transformers/) | مکمل API دستاویزات اور ٹیوٹوریلز |
+| [ٹاسک کا خلاصہ](https://huggingface.co/docs/transformers/task_summary) | 🤗 Transformers کے ذریعہ سپورٹ کردہ ٹاسک |
+| [پری پروسیسنگ ٹیوٹوریل](https://huggingface.co/docs/transformers/preprocessing) | ماڈلز کے لیے ڈیٹا تیار کرنے کے لیے `Tokenizer` کلاس کا استعمال |
+| [ٹریننگ اور فائن ٹیوننگ](https://huggingface.co/docs/transformers/training) | PyTorch/TensorFlow ٹریننگ لوپ میں 🤗 Transformers کی طرف سے فراہم کردہ ماڈلز کا استعمال اور `Trainer` API |
+| [تیز دورہ: فائن ٹیوننگ/استعمال کے اسکرپٹس](https://github.com/huggingface/transformers/tree/main/examples) | مختلف قسم کے ٹاسک پر ماڈلز کو فائن ٹیون کرنے کے لیے مثال کے اسکرپٹس |
+| [ماڈل کا اشتراک اور اپ لوڈ کرنا](https://huggingface.co/docs/transformers/model_sharing) | اپنے فائن ٹیون کردہ ماڈلز کو کمیونٹی کے ساتھ اپ لوڈ اور شیئر کریں |
+
+## استشہاد
+
+ہم نے اب ایک [تحقیقی مقالہ](https://www.aclweb.org/anthology/2020.emnlp-demos.6/) تیار کیا ہے جسے آپ 🤗 Transformers لائبریری کے لیے حوالہ دے سکتے ہیں:
+
+```bibtex
+@inproceedings{wolf-etal-2020-transformers,
+ title = "Transformers: State-of-the-Art Natural Language Processing",
+ author = "Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and R{\'e}mi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush",
+ booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
+ month = oct,
+ year = "2020",
+ address = "Online",
+ publisher = "Association for Computational Linguistics",
+ url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
+ pages = "38--45"
+}
+```
diff --git a/i18n/README_vi.md b/i18n/README_vi.md
index f85dda3e215d25..5e5c2ab1e25cf7 100644
--- a/i18n/README_vi.md
+++ b/i18n/README_vi.md
@@ -49,6 +49,7 @@ limitations under the License.
Deutsch |
Tiếng việt |
العربية |
+ اردو |
diff --git a/i18n/README_zh-hans.md b/i18n/README_zh-hans.md
index f857f50d1a55c9..61f3a19849ff55 100644
--- a/i18n/README_zh-hans.md
+++ b/i18n/README_zh-hans.md
@@ -69,6 +69,7 @@ checkpoint: 检查点
Deutsch |
Tiếng Việt |
العربية |
+ اردو |
diff --git a/i18n/README_zh-hant.md b/i18n/README_zh-hant.md
index 721e6575dec721..e20798a2d4571f 100644
--- a/i18n/README_zh-hant.md
+++ b/i18n/README_zh-hant.md
@@ -81,6 +81,7 @@ user: 使用者
Deutsch |
Tiếng Việt |
العربية |
+ اردو |
From 4f1e9bae4e0a2d94e8a347964569dd1df385de55 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ziy=C3=BA=20Ye?=
Date: Wed, 18 Sep 2024 07:23:05 -0700
Subject: [PATCH 37/67] fix the wandb logging issue (#33464)
* fix the wandb logging issue
* handle ConfigError in WandbCallback; move import to local scope
* update integration_utils.py; move import of ConfigError
* Update integration_utils.py: remove trailing whitespace
---
src/transformers/integrations/integration_utils.py | 12 +++++++++++-
1 file changed, 11 insertions(+), 1 deletion(-)
diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py
index 9172f9599f77b0..40298f9c6fc77b 100755
--- a/src/transformers/integrations/integration_utils.py
+++ b/src/transformers/integrations/integration_utils.py
@@ -803,6 +803,10 @@ def setup(self, args, state, model, **kwargs):
if self._wandb is None:
return
self._initialized = True
+
+ # prepare to handle potential configuration issues during setup
+ from wandb.sdk.lib.config_util import ConfigError as WandbConfigError
+
if state.is_world_process_zero:
logger.info(
'Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"'
@@ -852,7 +856,13 @@ def setup(self, args, state, model, **kwargs):
try:
self._wandb.config["model/num_parameters"] = model.num_parameters()
except AttributeError:
- logger.info("Could not log the number of model parameters in Weights & Biases.")
+ logger.info(
+ "Could not log the number of model parameters in Weights & Biases due to an AttributeError."
+ )
+ except WandbConfigError:
+ logger.warning(
+ "A ConfigError was raised whilst setting the number of model parameters in Weights & Biases config."
+ )
# log the initial model architecture to an artifact
if self._log_model.is_enabled:
From f883827c0a0832b9dd53ede18aa7fffe74a1fec2 Mon Sep 17 00:00:00 2001
From: Yoach Lacombe <52246514+ylacombe@users.noreply.github.com>
Date: Wed, 18 Sep 2024 16:25:45 +0200
Subject: [PATCH 38/67] Fix tests in ASR pipeline (#33545)
---
..._pipelines_automatic_speech_recognition.py | 74 +++++++++----------
1 file changed, 35 insertions(+), 39 deletions(-)
diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
index abb07d831ad003..842933d2b76c94 100644
--- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py
+++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
@@ -295,8 +295,8 @@ def test_torch_large(self):
self.assertEqual(output, {"text": ""})
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
- filename = ds[40]["file"]
- output = speech_recognizer(filename)
+ audio = ds[40]["audio"]
+ output = speech_recognizer(audio)
self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"})
@require_torch
@@ -312,8 +312,8 @@ def test_torch_large_with_input_features(self):
self.assertEqual(output, {"text": ""})
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
- filename = ds[40]["file"]
- output = speech_recognizer(filename)
+ audio = ds[40]["audio"]
+ output = speech_recognizer(audio)
self.assertEqual(output, {"text": "a man said to the universe sir i exist"})
@slow
@@ -542,11 +542,11 @@ def test_torch_whisper(self):
framework="pt",
)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
- filename = ds[40]["file"]
- output = speech_recognizer(filename)
+ audio = ds[40]["audio"]
+ output = speech_recognizer(audio)
self.assertEqual(output, {"text": " A man said to the universe, Sir, I exist."})
- output = speech_recognizer([filename], chunk_length_s=5, batch_size=4)
+ output = speech_recognizer([ds[40]["audio"]], chunk_length_s=5, batch_size=4)
self.assertEqual(output, [{"text": " A man said to the universe, Sir, I exist."}])
@require_torch
@@ -1014,8 +1014,8 @@ def test_torch_speech_encoder_decoder(self):
)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
- filename = ds[40]["file"]
- output = speech_recognizer(filename)
+ audio = ds[40]["audio"]
+ output = speech_recognizer(audio)
self.assertEqual(output, {"text": 'Ein Mann sagte zum Universum : " Sir, ich existiert! "'})
@slow
@@ -1032,13 +1032,11 @@ def test_simple_wav2vec2(self):
self.assertEqual(output, {"text": ""})
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
- filename = ds[40]["file"]
- output = asr(filename)
+ audio = ds[40]["audio"]
+ output = asr(audio)
self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"})
- filename = ds[40]["file"]
- with open(filename, "rb") as f:
- data = f.read()
+ data = Audio().encode_example(ds[40]["audio"])["bytes"]
output = asr(data)
self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"})
@@ -1058,13 +1056,11 @@ def test_simple_s2t(self):
self.assertEqual(output, {"text": "(Applausi)"})
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
- filename = ds[40]["file"]
- output = asr(filename)
+ audio = ds[40]["audio"]
+ output = asr(audio)
self.assertEqual(output, {"text": "Un uomo disse all'universo: \"Signore, io esisto."})
- filename = ds[40]["file"]
- with open(filename, "rb") as f:
- data = f.read()
+ data = Audio().encode_example(ds[40]["audio"])["bytes"]
output = asr(data)
self.assertEqual(output, {"text": "Un uomo disse all'universo: \"Signore, io esisto."})
@@ -1078,13 +1074,13 @@ def test_simple_whisper_asr(self):
framework="pt",
)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
- filename = ds[0]["file"]
- output = speech_recognizer(filename)
+ audio = ds[0]["audio"]
+ output = speech_recognizer(audio)
self.assertEqual(
output,
{"text": " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel."},
)
- output = speech_recognizer(filename, return_timestamps=True)
+ output = speech_recognizer(ds[0]["audio"], return_timestamps=True)
self.assertEqual(
output,
{
@@ -1100,7 +1096,7 @@ def test_simple_whisper_asr(self):
},
)
speech_recognizer.model.generation_config.alignment_heads = [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]]
- output = speech_recognizer(filename, return_timestamps="word")
+ output = speech_recognizer(ds[0]["audio"], return_timestamps="word")
# fmt: off
self.assertEqual(
output,
@@ -1135,7 +1131,7 @@ def test_simple_whisper_asr(self):
"^Whisper cannot return `char` timestamps, only word level or segment level timestamps. "
"Use `return_timestamps='word'` or `return_timestamps=True` respectively.$",
):
- _ = speech_recognizer(filename, return_timestamps="char")
+ _ = speech_recognizer(audio, return_timestamps="char")
@slow
@require_torch
@@ -1147,8 +1143,8 @@ def test_simple_whisper_translation(self):
framework="pt",
)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
- filename = ds[40]["file"]
- output = speech_recognizer(filename)
+ audio = ds[40]["audio"]
+ output = speech_recognizer(audio)
self.assertEqual(output, {"text": " A man said to the universe, Sir, I exist."})
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
@@ -1158,7 +1154,7 @@ def test_simple_whisper_translation(self):
speech_recognizer_2 = AutomaticSpeechRecognitionPipeline(
model=model, tokenizer=tokenizer, feature_extractor=feature_extractor
)
- output_2 = speech_recognizer_2(filename)
+ output_2 = speech_recognizer_2(ds[0]["audio"])
self.assertEqual(output, output_2)
# either use generate_kwargs or set the model's generation_config
@@ -1170,7 +1166,7 @@ def test_simple_whisper_translation(self):
feature_extractor=feature_extractor,
generate_kwargs={"task": "transcribe", "language": "<|it|>"},
)
- output_3 = speech_translator(filename)
+ output_3 = speech_translator(ds[0]["audio"])
self.assertEqual(output_3, {"text": " Un uomo ha detto all'universo, Sir, esiste."})
@slow
@@ -1182,10 +1178,10 @@ def test_whisper_language(self):
framework="pt",
)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
- filename = ds[0]["file"]
+ audio = ds[0]["audio"]
# 1. English-only model compatible with no language argument
- output = speech_recognizer(filename)
+ output = speech_recognizer(audio)
self.assertEqual(
output,
{"text": " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel."},
@@ -1197,7 +1193,7 @@ def test_whisper_language(self):
"Cannot specify `task` or `language` for an English-only model. If the model is intended to be multilingual, "
"pass `is_multilingual=True` to generate, or update the generation config.",
):
- _ = speech_recognizer(filename, generate_kwargs={"language": "en"})
+ _ = speech_recognizer(ds[0]["audio"], generate_kwargs={"language": "en"})
# 3. Multilingual model accepts language argument
speech_recognizer = pipeline(
@@ -1205,7 +1201,7 @@ def test_whisper_language(self):
model="openai/whisper-tiny",
framework="pt",
)
- output = speech_recognizer(filename, generate_kwargs={"language": "en"})
+ output = speech_recognizer(ds[0]["audio"], generate_kwargs={"language": "en"})
self.assertEqual(
output,
{"text": " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel."},
@@ -1315,8 +1311,8 @@ def test_xls_r_to_en(self):
)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
- filename = ds[40]["file"]
- output = speech_recognizer(filename)
+ audio = ds[40]["audio"]
+ output = speech_recognizer(audio)
self.assertEqual(output, {"text": "A man said to the universe: “Sir, I exist."})
@slow
@@ -1331,8 +1327,8 @@ def test_xls_r_from_en(self):
)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
- filename = ds[40]["file"]
- output = speech_recognizer(filename)
+ audio = ds[40]["audio"]
+ output = speech_recognizer(audio)
self.assertEqual(output, {"text": "Ein Mann sagte zu dem Universum, Sir, ich bin da."})
@slow
@@ -1348,9 +1344,8 @@ def test_speech_to_text_leveraged(self):
)
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
- filename = ds[40]["file"]
-
- output = speech_recognizer(filename)
+ audio = ds[40]["audio"]
+ output = speech_recognizer(audio)
self.assertEqual(output, {"text": "a man said to the universe sir i exist"})
@slow
@@ -1561,6 +1556,7 @@ def test_whisper_longform(self):
feature_extractor=processor.feature_extractor,
max_new_tokens=128,
device=torch_device,
+ return_timestamps=True, # to allow longform generation
)
ds = load_dataset("distil-whisper/meanwhile", "default")["test"]
From fc83a4d45921150b4c23b68d08ac9ee946070149 Mon Sep 17 00:00:00 2001
From: Umar Butler
Date: Thu, 19 Sep 2024 00:41:50 +1000
Subject: [PATCH 39/67] Added support for bfloat16 to zero-shot classification
pipeline (#33554)
* Added support for bfloat16 to zero-shot classification pipeline
* Ensure support for TF.
Co-authored-by: Matt
* Remove dependency on `torch`.
Co-authored-by: Matt
---------
Co-authored-by: Matt
---
src/transformers/pipelines/zero_shot_classification.py | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/src/transformers/pipelines/zero_shot_classification.py b/src/transformers/pipelines/zero_shot_classification.py
index 9a600bc8ad0fb8..f4aee3341e30d5 100644
--- a/src/transformers/pipelines/zero_shot_classification.py
+++ b/src/transformers/pipelines/zero_shot_classification.py
@@ -239,7 +239,10 @@ def _forward(self, inputs):
def postprocess(self, model_outputs, multi_label=False):
candidate_labels = [outputs["candidate_label"] for outputs in model_outputs]
sequences = [outputs["sequence"] for outputs in model_outputs]
- logits = np.concatenate([output["logits"].numpy() for output in model_outputs])
+ if self.framework == "pt":
+ logits = np.concatenate([output["logits"].float().numpy() for output in model_outputs])
+ else:
+ logits = np.concatenate([output["logits"].numpy() for output in model_outputs])
N = logits.shape[0]
n = len(candidate_labels)
num_sequences = N // n
From 7542fac2c7e5e5761fb0394b045a8e4d9168da1c Mon Sep 17 00:00:00 2001
From: Joao Gante
Date: Wed, 18 Sep 2024 15:43:06 +0100
Subject: [PATCH 40/67] =?UTF-8?q?Pipeline:=20no=20side-effects=20on=20`mod?=
=?UTF-8?q?el.config`=20and=20`model.generation=5Fconfig`=20=F0=9F=94=AB?=
=?UTF-8?q?=20=20(#33480)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.../generation/configuration_utils.py | 4 ++
src/transformers/generation/utils.py | 13 ++++---
.../pipelines/automatic_speech_recognition.py | 4 ++
src/transformers/pipelines/base.py | 37 +++++++++++--------
.../pipelines/document_question_answering.py | 4 ++
src/transformers/pipelines/image_to_text.py | 4 ++
.../pipelines/table_question_answering.py | 4 ++
.../pipelines/text2text_generation.py | 11 ++++--
src/transformers/pipelines/text_generation.py | 13 ++++---
src/transformers/pipelines/text_to_audio.py | 6 ++-
.../pipelines/visual_question_answering.py | 4 ++
tests/pipelines/test_pipelines_common.py | 26 +++++++++++++
tests/utils/test_modeling_utils.py | 32 ++++++++++++++++
13 files changed, 132 insertions(+), 30 deletions(-)
diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py
index e2585b1b9ed49c..5e9ac835c19d6d 100644
--- a/src/transformers/generation/configuration_utils.py
+++ b/src/transformers/generation/configuration_utils.py
@@ -1229,6 +1229,10 @@ def from_model_config(cls, model_config: PretrainedConfig) -> "GenerationConfig"
"""
config_dict = model_config.to_dict()
config_dict.pop("_from_model_config", None)
+
+ # Removes all `None` values from the model config dict -- this lets the generation config defaults take hold
+ config_dict = {key: value for key, value in config_dict.items() if value is not None}
+
generation_config = cls.from_dict(config_dict, return_unused_kwargs=False, _from_model_config=True)
# Special case: some models have generation attributes set in the decoder. Use them if still unset in the
diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py
index 019eb6c27f18cc..d8896f91267d7b 100644
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@@ -1334,23 +1334,26 @@ def _prepare_generation_config(
# the following conditions must be met
# 1) the generation config must have been created from the model config (`_from_model_config` field);
# 2) the generation config must have seen no modification since its creation (the hash is the same);
- # 3) the user must have set generation parameters in the model config.
+ # 3) there are non-default generation parameters in the model config.
+ # 4) the user must have set new generation parameters in the model config.
# NOTE: `torch.compile` can't compile `hash`, this legacy support is disabled with compilation.
if (
not is_torchdynamo_compiling()
and self.generation_config._from_model_config # 1)
and self.generation_config._original_object_hash == hash(self.generation_config) # 2)
+ and len(self.config._get_non_default_generation_parameters()) > 0 # 3)
):
new_generation_config = GenerationConfig.from_model_config(self.config)
- if new_generation_config != self.generation_config: # 3)
+ if new_generation_config != self.generation_config: # 4)
warnings.warn(
"You have modified the pretrained model configuration to control generation. This is a"
- " deprecated strategy to control generation and will be removed soon, in a future version."
+ " deprecated strategy to control generation and will be removed in v5."
" Please use and modify the model generation configuration (see"
- " https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )"
+ " https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )",
+ UserWarning,
)
self.generation_config = new_generation_config
- using_model_generation_config = True
+
generation_config = self.generation_config
using_model_generation_config = True
diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py
index f3de341d88954c..7c122bed5437cc 100644
--- a/src/transformers/pipelines/automatic_speech_recognition.py
+++ b/src/transformers/pipelines/automatic_speech_recognition.py
@@ -501,6 +501,10 @@ def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs):
else:
generate_kwargs["num_frames"] = num_frames
+ # User-defined `generation_config` passed to the pipeline call takes precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
tokens = self.model.generate(
inputs=inputs,
attention_mask=attention_mask,
diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py
index 7db33ab5bd1a01..40a91a0d484b8e 100644
--- a/src/transformers/pipelines/base.py
+++ b/src/transformers/pipelines/base.py
@@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
+import copy
import csv
import importlib
import json
@@ -899,22 +900,26 @@ def __init__(
):
self.model.to(self.device)
- # Update config and generation_config with task specific parameters
- task_specific_params = self.model.config.task_specific_params
- if task_specific_params is not None and task in task_specific_params:
- self.model.config.update(task_specific_params.get(task))
- if self.model.can_generate():
- self.model.generation_config.update(**task_specific_params.get(task))
-
- # Pipelines calling `generate`: if the tokenizer has a pad token but the model doesn't, set it in the
- # forward params so that `generate` is aware of the pad token.
- if (
- self.tokenizer is not None
- and self.model.can_generate()
- and self.tokenizer.pad_token_id is not None
- and self.model.generation_config.pad_token_id is None
- ):
- self.model.generation_config.pad_token_id = self.tokenizer.pad_token_id
+ # If the model can generate, create a local generation config. This is done to avoid side-effects on the model
+ # as we apply local tweaks to the generation config.
+ if self.model.can_generate():
+ self.prefix = self.model.config.prefix if hasattr(self.model.config, "prefix") else None
+ self.generation_config = copy.deepcopy(self.model.generation_config)
+ # Update the generation config with task specific params if they exist
+ # NOTE: `prefix` is pipeline-specific and doesn't exist in the generation config.
+ task_specific_params = self.model.config.task_specific_params
+ if task_specific_params is not None and task in task_specific_params:
+ this_task_params = task_specific_params.get(task)
+ if "prefix" in this_task_params:
+ self.prefix = this_task_params.pop("prefix")
+ self.generation_config.update(**this_task_params)
+ # If the tokenizer has a pad token but the model doesn't, set it so that `generate` is aware of it.
+ if (
+ self.tokenizer is not None
+ and self.tokenizer.pad_token_id is not None
+ and self.generation_config.pad_token_id is None
+ ):
+ self.generation_config.pad_token_id = self.tokenizer.pad_token_id
self.call_count = 0
self._batch_size = kwargs.pop("batch_size", None)
diff --git a/src/transformers/pipelines/document_question_answering.py b/src/transformers/pipelines/document_question_answering.py
index aa4fb48aae6a40..9198f432263822 100644
--- a/src/transformers/pipelines/document_question_answering.py
+++ b/src/transformers/pipelines/document_question_answering.py
@@ -429,6 +429,10 @@ def _forward(self, model_inputs, **generate_kwargs):
is_last = model_inputs.pop("is_last", False)
if self.model_type == ModelType.VisionEncoderDecoder:
+ # User-defined `generation_config` passed to the pipeline call takes precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
model_outputs = self.model.generate(**model_inputs, **generate_kwargs)
else:
model_outputs = self.model(**model_inputs)
diff --git a/src/transformers/pipelines/image_to_text.py b/src/transformers/pipelines/image_to_text.py
index 88dce8e591ae41..91d44c46d25c10 100644
--- a/src/transformers/pipelines/image_to_text.py
+++ b/src/transformers/pipelines/image_to_text.py
@@ -181,6 +181,10 @@ def _forward(self, model_inputs, **generate_kwargs):
):
model_inputs["input_ids"] = None
+ # User-defined `generation_config` passed to the pipeline call takes precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
# FIXME: We need to pop here due to a difference in how `generation.py` and `generation.tf_utils.py`
# parse inputs. In the Tensorflow version, `generate` raises an error if we don't use `input_ids` whereas
# the PyTorch version matches it with `self.model.main_input_name` or `self.model.encoder.main_input_name`
diff --git a/src/transformers/pipelines/table_question_answering.py b/src/transformers/pipelines/table_question_answering.py
index 702a47b7c3cbed..77c95432c7218f 100644
--- a/src/transformers/pipelines/table_question_answering.py
+++ b/src/transformers/pipelines/table_question_answering.py
@@ -385,6 +385,10 @@ def _forward(self, model_inputs, sequential=False, **generate_kwargs):
else:
outputs = self.batch_inference(**model_inputs)
else:
+ # User-defined `generation_config` passed to the pipeline call takes precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
outputs = self.model.generate(**model_inputs, **generate_kwargs)
model_outputs = {"model_inputs": model_inputs, "table": table, "outputs": outputs}
return model_outputs
diff --git a/src/transformers/pipelines/text2text_generation.py b/src/transformers/pipelines/text2text_generation.py
index 42d97f4d11b919..75ded8ac085ca5 100644
--- a/src/transformers/pipelines/text2text_generation.py
+++ b/src/transformers/pipelines/text2text_generation.py
@@ -115,7 +115,7 @@ def check_inputs(self, input_length: int, min_length: int, max_length: int):
return True
def _parse_and_tokenize(self, *args, truncation):
- prefix = self.model.config.prefix if self.model.config.prefix is not None else ""
+ prefix = self.prefix if self.prefix is not None else ""
if isinstance(args[0], list):
if self.tokenizer.pad_token_id is None:
raise ValueError("Please make sure that the tokenizer has a pad_token_id when using a batch input")
@@ -185,9 +185,14 @@ def _forward(self, model_inputs, **generate_kwargs):
self.check_inputs(
input_length,
- generate_kwargs.get("min_length", self.model.config.min_length),
- generate_kwargs.get("max_length", self.model.config.max_length),
+ generate_kwargs.get("min_length", self.generation_config.min_length),
+ generate_kwargs.get("max_length", self.generation_config.max_length),
)
+
+ # User-defined `generation_config` passed to the pipeline call takes precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
output_ids = self.model.generate(**model_inputs, **generate_kwargs)
out_b = output_ids.shape[0]
if self.framework == "pt":
diff --git a/src/transformers/pipelines/text_generation.py b/src/transformers/pipelines/text_generation.py
index 8bd1017ffc6696..9bffca522d5f2e 100644
--- a/src/transformers/pipelines/text_generation.py
+++ b/src/transformers/pipelines/text_generation.py
@@ -103,8 +103,8 @@ def __init__(self, *args, **kwargs):
# It also defines both some preprocess_kwargs and generate_kwargs
# which is why we cannot put them in their respective methods.
prefix = None
- if self.model.config.prefix is not None:
- prefix = self.model.config.prefix
+ if self.prefix is not None:
+ prefix = self.prefix
if prefix is None and self.model.__class__.__name__ in [
"XLNetLMHeadModel",
"TransfoXLLMHeadModel",
@@ -316,7 +316,7 @@ def preprocess(
if "max_new_tokens" in generate_kwargs:
new_tokens = generate_kwargs["max_new_tokens"]
else:
- new_tokens = generate_kwargs.get("max_length", self.model.config.max_length) - cur_len
+ new_tokens = generate_kwargs.get("max_length", self.generation_config.max_length) - cur_len
if new_tokens < 0:
raise ValueError("We cannot infer how many new tokens are expected")
if cur_len + new_tokens > self.tokenizer.model_max_length:
@@ -354,7 +354,7 @@ def _forward(self, model_inputs, **generate_kwargs):
and generate_kwargs["generation_config"].max_new_tokens is not None
)
if not has_max_new_tokens:
- generate_kwargs["max_length"] = generate_kwargs.get("max_length") or self.model.config.max_length
+ generate_kwargs["max_length"] = generate_kwargs.get("max_length") or self.generation_config.max_length
generate_kwargs["max_length"] += prefix_length
has_min_new_tokens = "min_new_tokens" in generate_kwargs or (
"generation_config" in generate_kwargs
@@ -363,7 +363,10 @@ def _forward(self, model_inputs, **generate_kwargs):
if not has_min_new_tokens and "min_length" in generate_kwargs:
generate_kwargs["min_length"] += prefix_length
- # BS x SL
+ # User-defined `generation_config` passed to the pipeline call takes precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
generated_sequence = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, **generate_kwargs)
out_b = generated_sequence.shape[0]
if self.framework == "pt":
diff --git a/src/transformers/pipelines/text_to_audio.py b/src/transformers/pipelines/text_to_audio.py
index 81653f14d6d878..d17d18205920b0 100644
--- a/src/transformers/pipelines/text_to_audio.py
+++ b/src/transformers/pipelines/text_to_audio.py
@@ -111,7 +111,7 @@ def preprocess(self, text, **kwargs):
if self.model.config.model_type == "bark":
# bark Tokenizer is called with BarkProcessor which uses those kwargs
new_kwargs = {
- "max_length": self.model.generation_config.semantic_config.get("max_input_semantic_length", 256),
+ "max_length": self.generation_config.semantic_config.get("max_input_semantic_length", 256),
"add_special_tokens": False,
"return_attention_mask": True,
"return_token_type_ids": False,
@@ -137,6 +137,10 @@ def _forward(self, model_inputs, **kwargs):
# we expect some kwargs to be additional tensors which need to be on the right device
generate_kwargs = self._ensure_tensor_on_device(generate_kwargs, device=self.device)
+ # User-defined `generation_config` passed to the pipeline call takes precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
# generate_kwargs get priority over forward_params
forward_params.update(generate_kwargs)
diff --git a/src/transformers/pipelines/visual_question_answering.py b/src/transformers/pipelines/visual_question_answering.py
index e5849cbdec1955..89988c0cba2b1b 100644
--- a/src/transformers/pipelines/visual_question_answering.py
+++ b/src/transformers/pipelines/visual_question_answering.py
@@ -162,6 +162,10 @@ def preprocess(self, inputs, padding=False, truncation=False, timeout=None):
def _forward(self, model_inputs, **generate_kwargs):
if self.model.can_generate():
+ # User-defined `generation_config` passed to the pipeline call takes precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
model_outputs = self.model.generate(**model_inputs, **generate_kwargs)
else:
model_outputs = self.model(**model_inputs)
diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index ea36ae5728d161..1fec4be3d95ca0 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -31,6 +31,7 @@
AutoTokenizer,
DistilBertForSequenceClassification,
MaskGenerationPipeline,
+ T5ForConditionalGeneration,
TextClassificationPipeline,
TextGenerationPipeline,
TFAutoModelForSequenceClassification,
@@ -234,6 +235,31 @@ def test_auto_model_pipeline_registration_from_local_dir(self):
self.assertIsInstance(pipe, TextGenerationPipeline) # Assert successful load
+ @require_torch
+ def test_pipeline_with_task_parameters_no_side_effects(self):
+ """
+ Regression test: certain pipeline flags, like `task`, modified the model configuration, causing unexpected
+ side-effects
+ """
+ # This checkpoint has task-specific parameters that will modify the behavior of the pipeline
+ model = T5ForConditionalGeneration.from_pretrained("t5-small")
+ self.assertTrue(model.config.num_beams == 1)
+
+ # The task-specific parameters used to cause side-effects on `model.config` -- not anymore
+ pipe = pipeline(model=model, tokenizer=AutoTokenizer.from_pretrained("t5-small"), task="translation_en_to_de")
+ self.assertTrue(model.config.num_beams == 1)
+ self.assertTrue(model.generation_config.num_beams == 1)
+
+ # Under the hood: we now store a generation config in the pipeline. This generation config stores the
+ # task-specific parameters.
+ self.assertTrue(pipe.generation_config.num_beams == 4)
+
+ # We can confirm that the task-specific parameters have an effect. (In this case, the default is `num_beams=1`,
+ # which would crash when `num_return_sequences=4` is passed.)
+ pipe("Hugging Face doesn't sell hugs.", num_return_sequences=4)
+ with self.assertRaises(ValueError):
+ pipe("Hugging Face doesn't sell hugs.", num_return_sequences=4, num_beams=1)
+
@is_pipeline_test
class PipelineScikitCompatTest(unittest.TestCase):
diff --git a/tests/utils/test_modeling_utils.py b/tests/utils/test_modeling_utils.py
index f78285fdb90d90..2130ed4b7c887f 100644
--- a/tests/utils/test_modeling_utils.py
+++ b/tests/utils/test_modeling_utils.py
@@ -1715,6 +1715,38 @@ def test_isin_mps_friendly(self):
torch.equal(torch.isin(random_ids, random_test_tensor), isin_mps_friendly(random_ids, random_test_tensor))
)
+ def test_save_and_load_config_with_custom_generation(self):
+ """
+ Regression test for the ability to save and load a config with a custom generation kwarg (i.e. a parameter
+ that gets moved to the generation config and reset on the model config)
+ """
+ model = T5ForConditionalGeneration.from_pretrained(TINY_T5)
+
+ # The default for `num_beams` is 1 and `early_stopping` is False
+ self.assertTrue(model.config.num_beams == 1)
+ self.assertTrue(model.config.early_stopping is False)
+
+ # When we save the model, this custom parameter should be moved to the generation config AND the model
+ # config should contain `None`
+ model.config.num_beams = 2
+ model.config.early_stopping = True
+ self.assertTrue(model.generation_config.num_beams == 1) # unmodified generation config
+ with tempfile.TemporaryDirectory() as tmp_dir:
+ model.save_pretrained(tmp_dir)
+ new_model = T5ForConditionalGeneration.from_pretrained(tmp_dir)
+ # moved to generation config
+ self.assertTrue(new_model.generation_config.num_beams == 2)
+ self.assertTrue(new_model.generation_config.early_stopping is True)
+ # reset in the model config
+ self.assertTrue(new_model.config.num_beams is None)
+ self.assertTrue(new_model.config.early_stopping is None)
+
+ # Sanity check: We can run `generate` with the new model without any warnings
+ random_ids = torch.randint(0, 100, (1, 5))
+ with warnings.catch_warnings(record=True) as w:
+ new_model.generate(random_ids, max_new_tokens=3)
+ self.assertTrue(len(w) == 0)
+
@slow
@require_torch
From 8efc06ee1863bd6e34e8adb7b10901da87c66818 Mon Sep 17 00:00:00 2001
From: Matt
Date: Wed, 18 Sep 2024 15:57:39 +0100
Subject: [PATCH 41/67] Return attention mask in ASR pipeline to avoid warnings
(#33509)
return attention mask in ASR pipeline
---
.../pipelines/automatic_speech_recognition.py | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py
index 7c122bed5437cc..4301982f1e901c 100644
--- a/src/transformers/pipelines/automatic_speech_recognition.py
+++ b/src/transformers/pipelines/automatic_speech_recognition.py
@@ -440,6 +440,7 @@ def preprocess(self, inputs, chunk_length_s=0, stride_length_s=None):
truncation=False,
padding="longest",
return_tensors="pt",
+ return_attention_mask=True,
)
else:
if self.type == "seq2seq_whisper" and stride is None:
@@ -448,13 +449,16 @@ def preprocess(self, inputs, chunk_length_s=0, stride_length_s=None):
sampling_rate=self.feature_extractor.sampling_rate,
return_tensors="pt",
return_token_timestamps=True,
+ return_attention_mask=True,
)
extra["num_frames"] = processed.pop("num_frames")
else:
processed = self.feature_extractor(
- inputs, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="pt"
+ inputs,
+ sampling_rate=self.feature_extractor.sampling_rate,
+ return_tensors="pt",
+ return_attention_mask=True,
)
-
if self.torch_dtype is not None:
processed = processed.to(dtype=self.torch_dtype)
if stride is not None:
From 9db963aeed419c8379c6d6425186fec0bfb86908 Mon Sep 17 00:00:00 2001
From: Dominik Niedziela <99881522+dom-dziela@users.noreply.github.com>
Date: Wed, 18 Sep 2024 17:38:31 +0200
Subject: [PATCH 42/67] enforce original size to be a list (#33564)
* enforce original size to be a list
* formatting
* apply datatype change to unpad_image in llava_next
---
src/transformers/models/llava_next/modeling_llava_next.py | 6 ++++++
.../models/llava_onevision/modeling_llava_onevision.py | 6 ++++++
2 files changed, 12 insertions(+)
diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py
index c1d1ca8c276d7a..bf76921090b244 100644
--- a/src/transformers/models/llava_next/modeling_llava_next.py
+++ b/src/transformers/models/llava_next/modeling_llava_next.py
@@ -123,6 +123,12 @@ def unpad_image(tensor, original_size):
Returns:
`torch.Tensor`: The unpadded image tensor.
"""
+ if not isinstance(original_size, (list, tuple)):
+ if not isinstance(original_size, (torch.Tensor, np.ndarray)):
+ raise TypeError(
+ f"image_size invalid type: {type(original_size)} not valid, should be either list, tuple, np.ndarray or tensor"
+ )
+ original_size = original_size.tolist()
original_height, original_width = original_size
current_height, current_width = tensor.shape[1:]
diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py
index 697ea84fea5040..d3200fb5193d4b 100644
--- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py
+++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py
@@ -124,6 +124,12 @@ def unpad_image(tensor, original_size):
Returns:
`torch.Tensor`: The unpadded image tensor.
"""
+ if not isinstance(original_size, (list, tuple)):
+ if not isinstance(original_size, (torch.Tensor, np.ndarray)):
+ raise TypeError(
+ f"image_size invalid type: {type(original_size)} not valid, should be either list, tuple, np.ndarray or tensor"
+ )
+ original_size = original_size.tolist()
original_height, original_width = original_size
current_height, current_width = tensor.shape[1:]
From 7b1ce634cb16f86725826e427bf30f1276cc0e19 Mon Sep 17 00:00:00 2001
From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com>
Date: Wed, 18 Sep 2024 12:56:45 -0400
Subject: [PATCH 43/67] Improve compiled RT-DETR inference speed (#33412)
* modify rt detr to improve inference times when compiled
* Remove redundant "to"
* Fix conditional lru_cache and missing shapes_list
* nit unnecessary list creation
* Fix compile error when ninja not available and custom kernel activated
---
.../models/rt_detr/modeling_rt_detr.py | 86 ++++++++++++++-----
1 file changed, 64 insertions(+), 22 deletions(-)
diff --git a/src/transformers/models/rt_detr/modeling_rt_detr.py b/src/transformers/models/rt_detr/modeling_rt_detr.py
index ab83a81f50674d..4e32434901cdc7 100644
--- a/src/transformers/models/rt_detr/modeling_rt_detr.py
+++ b/src/transformers/models/rt_detr/modeling_rt_detr.py
@@ -18,7 +18,7 @@
import os
import warnings
from dataclasses import dataclass
-from functools import lru_cache, partial
+from functools import lru_cache, partial, wraps
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union
@@ -737,7 +737,9 @@ def multi_scale_deformable_attention(
) -> Tensor:
batch_size, _, num_heads, hidden_dim = value.shape
_, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
- value_list = value.split([height.item() * width.item() for height, width in value_spatial_shapes], dim=1)
+ # Ignore copy
+ value_list = value.split([height * width for height, width in value_spatial_shapes], dim=1)
+
sampling_grids = 2 * sampling_locations - 1
sampling_value_list = []
for level_id, (height, width) in enumerate(value_spatial_shapes):
@@ -849,6 +851,7 @@ def forward(
position_embeddings: Optional[torch.Tensor] = None,
reference_points=None,
spatial_shapes=None,
+ spatial_shapes_list=None,
level_start_index=None,
output_attentions: bool = False,
):
@@ -858,7 +861,10 @@ def forward(
batch_size, num_queries, _ = hidden_states.shape
batch_size, sequence_length, _ = encoder_hidden_states.shape
- if (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length:
+
+ # Ignore copy
+ total_elements = sum(shape[0] * shape[1] for shape in spatial_shapes_list)
+ if total_elements != sequence_length:
raise ValueError(
"Make sure to align the spatial shapes with the sequence length of the encoder hidden states"
)
@@ -893,9 +899,12 @@ def forward(
else:
raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}")
- if self.disable_custom_kernels:
+ # Ignore copy
+ if self.disable_custom_kernels or MultiScaleDeformableAttention is None:
# PyTorch implementation
- output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights)
+ output = multi_scale_deformable_attention(
+ value, spatial_shapes_list, sampling_locations, attention_weights
+ )
else:
try:
# custom kernel
@@ -909,7 +918,9 @@ def forward(
)
except Exception:
# PyTorch implementation
- output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights)
+ output = multi_scale_deformable_attention(
+ value, spatial_shapes_list, sampling_locations, attention_weights
+ )
output = self.output_proj(output)
return output, attention_weights
@@ -1064,6 +1075,7 @@ def forward(
position_embeddings: Optional[torch.Tensor] = None,
reference_points=None,
spatial_shapes=None,
+ spatial_shapes_list=None,
level_start_index=None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
@@ -1114,6 +1126,7 @@ def forward(
position_embeddings=position_embeddings,
reference_points=reference_points,
spatial_shapes=spatial_shapes,
+ spatial_shapes_list=spatial_shapes_list,
level_start_index=level_start_index,
output_attentions=output_attentions,
)
@@ -1299,14 +1312,16 @@ def __init__(self, config: RTDetrConfig):
self.pan_blocks.append(RTDetrCSPRepLayer(config))
@staticmethod
- def build_2d_sincos_position_embedding(width, height, embed_dim=256, temperature=10000.0):
- grid_w = torch.arange(int(width), dtype=torch.float32)
- grid_h = torch.arange(int(height), dtype=torch.float32)
+ def build_2d_sincos_position_embedding(
+ width, height, embed_dim=256, temperature=10000.0, device="cpu", dtype=torch.float32
+ ):
+ grid_w = torch.arange(int(width), dtype=dtype, device=device)
+ grid_h = torch.arange(int(height), dtype=dtype, device=device)
grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij")
if embed_dim % 4 != 0:
raise ValueError("Embed dimension must be divisible by 4 for 2D sin-cos position embedding")
pos_dim = embed_dim // 4
- omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
+ omega = torch.arange(pos_dim, dtype=dtype, device=device) / pos_dim
omega = 1.0 / (temperature**omega)
out_w = grid_w.flatten()[..., None] @ omega[None]
@@ -1372,8 +1387,13 @@ def forward(
src_flatten = hidden_states[enc_ind].flatten(2).permute(0, 2, 1)
if self.training or self.eval_size is None:
pos_embed = self.build_2d_sincos_position_embedding(
- width, height, self.encoder_hidden_dim, self.positional_encoding_temperature
- ).to(src_flatten.device, src_flatten.dtype)
+ width,
+ height,
+ self.encoder_hidden_dim,
+ self.positional_encoding_temperature,
+ device=src_flatten.device,
+ dtype=src_flatten.dtype,
+ )
else:
pos_embed = None
@@ -1441,6 +1461,7 @@ def forward(
position_embeddings=None,
reference_points=None,
spatial_shapes=None,
+ spatial_shapes_list=None,
level_start_index=None,
valid_ratios=None,
output_attentions=None,
@@ -1512,6 +1533,7 @@ def forward(
encoder_hidden_states=encoder_hidden_states,
reference_points=reference_points_input,
spatial_shapes=spatial_shapes,
+ spatial_shapes_list=spatial_shapes_list,
level_start_index=level_start_index,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
@@ -1575,6 +1597,27 @@ def forward(
)
+def compile_compatible_lru_cache(*lru_args, **lru_kwargs):
+ def decorator(func):
+ @wraps(func)
+ def wrapper(self, *args, **kwargs):
+ if not torch.compiler.is_compiling():
+ # Cache the function only if the model is not being compiled
+ # check if the function is already cached, otherwise create it
+ if not hasattr(self, f"_cached_{func.__name__}"):
+ self.__setattr__(
+ f"_cached_{func.__name__}", lru_cache(*lru_args, **lru_kwargs)(func.__get__(self))
+ )
+ return self.__getattribute__(f"_cached_{func.__name__}")(*args, **kwargs)
+ else:
+ # Otherwise, just call the original function
+ return func(self, *args, **kwargs)
+
+ return wrapper
+
+ return decorator
+
+
@add_start_docstrings(
"""
RT-DETR Model (consisting of a backbone and encoder-decoder) outputting raw hidden states without any head on top.
@@ -1626,7 +1669,7 @@ def __init__(self, config: RTDetrConfig):
# init encoder output anchors and valid_mask
if config.anchor_image_size:
- self.anchors, self.valid_mask = self.generate_anchors()
+ self.anchors, self.valid_mask = self.generate_anchors(dtype=self.dtype)
# Create decoder input projection layers
# https://github.com/lyuwenyu/RT-DETR/blob/94f5e16708329d2f2716426868ec89aa774af016/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py#L412
@@ -1669,12 +1712,8 @@ def unfreeze_backbone(self):
for param in self.backbone.parameters():
param.requires_grad_(True)
- @lru_cache(maxsize=32)
- def generate_anchors(self, spatial_shapes=None, grid_size=0.05):
- # We always generate anchors in float32 to preserve equivalence between
- # dynamic and static anchor inference
- dtype = torch.float32
-
+ @compile_compatible_lru_cache(maxsize=32)
+ def generate_anchors(self, spatial_shapes=None, grid_size=0.05, device="cpu", dtype=torch.float32):
if spatial_shapes is None:
spatial_shapes = [
[int(self.config.anchor_image_size[0] / s), int(self.config.anchor_image_size[1] / s)]
@@ -1683,10 +1722,12 @@ def generate_anchors(self, spatial_shapes=None, grid_size=0.05):
anchors = []
for level, (height, width) in enumerate(spatial_shapes):
grid_y, grid_x = torch.meshgrid(
- torch.arange(end=height, dtype=dtype), torch.arange(end=width, dtype=dtype), indexing="ij"
+ torch.arange(end=height, dtype=dtype, device=device),
+ torch.arange(end=width, dtype=dtype, device=device),
+ indexing="ij",
)
grid_xy = torch.stack([grid_x, grid_y], -1)
- valid_wh = torch.tensor([width, height]).to(dtype)
+ valid_wh = torch.tensor([width, height], device=device).to(dtype)
grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_wh
wh = torch.ones_like(grid_xy) * grid_size * (2.0**level)
anchors.append(torch.concat([grid_xy, wh], -1).reshape(-1, height * width, 4))
@@ -1826,7 +1867,7 @@ def forward(
# Pass spatial_shapes as tuple to make it hashable and make sure
# lru_cache is working for generate_anchors()
spatial_shapes_tuple = tuple(spatial_shapes_list)
- anchors, valid_mask = self.generate_anchors(spatial_shapes_tuple)
+ anchors, valid_mask = self.generate_anchors(spatial_shapes_tuple, device=device, dtype=dtype)
else:
anchors, valid_mask = self.anchors, self.valid_mask
@@ -1873,6 +1914,7 @@ def forward(
encoder_attention_mask=attention_mask,
reference_points=init_reference_points,
spatial_shapes=spatial_shapes,
+ spatial_shapes_list=spatial_shapes_list,
level_start_index=level_start_index,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
From 6019f3ff7805f94b4bec1ad5fcf8c438ecb03ee6 Mon Sep 17 00:00:00 2001
From: Marc Sun <57196510+SunMarc@users.noreply.github.com>
Date: Wed, 18 Sep 2024 19:10:28 +0200
Subject: [PATCH 44/67] Fix bnb dequantization (#33546)
---
src/transformers/integrations/bitsandbytes.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/src/transformers/integrations/bitsandbytes.py b/src/transformers/integrations/bitsandbytes.py
index c49d353ccb520b..f37ca9a2650bf3 100644
--- a/src/transformers/integrations/bitsandbytes.py
+++ b/src/transformers/integrations/bitsandbytes.py
@@ -437,6 +437,7 @@ def _dequantize_and_replace(
new_module.to(device)
model._modules[name] = new_module
+ has_been_replaced = True
if len(list(module.children())) > 0:
_, has_been_replaced = _dequantize_and_replace(
module,
From 5af7d41e49bbfc8319f462eb45253dcb3863dfb7 Mon Sep 17 00:00:00 2001
From: Yoach Lacombe <52246514+ylacombe@users.noreply.github.com>
Date: Wed, 18 Sep 2024 19:23:44 +0200
Subject: [PATCH 45/67] Codec integration (#33565)
* clean mimi commit
* some nits suggestions from Arthur
* make fixup
* rename repo id + change readme
* Update docs/source/en/model_doc/mimi.md
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
* add flaky flag to batching equivalence due to audio_codes failing sometimes
---------
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
---
docs/source/en/_toctree.yml | 2 +
docs/source/en/index.md | 1 +
docs/source/en/model_doc/mimi.md | 69 +
docs/source/en/perf_infer_gpu_one.md | 2 +
src/transformers/__init__.py | 14 +
src/transformers/models/__init__.py | 1 +
.../models/auto/configuration_auto.py | 2 +
.../models/auto/feature_extraction_auto.py | 1 +
src/transformers/models/auto/modeling_auto.py | 1 +
src/transformers/models/mimi/__init__.py | 57 +
.../models/mimi/configuration_mimi.py | 234 +++
.../convert_mimi_checkpoint_to_pytorch.py | 198 ++
src/transformers/models/mimi/modeling_mimi.py | 1722 +++++++++++++++++
src/transformers/utils/dummy_pt_objects.py | 14 +
tests/models/mimi/__init__.py | 0
tests/models/mimi/test_modeling_mimi.py | 890 +++++++++
16 files changed, 3208 insertions(+)
create mode 100644 docs/source/en/model_doc/mimi.md
create mode 100644 src/transformers/models/mimi/__init__.py
create mode 100644 src/transformers/models/mimi/configuration_mimi.py
create mode 100644 src/transformers/models/mimi/convert_mimi_checkpoint_to_pytorch.py
create mode 100644 src/transformers/models/mimi/modeling_mimi.py
create mode 100644 tests/models/mimi/__init__.py
create mode 100644 tests/models/mimi/test_modeling_mimi.py
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 7eff2a38302669..59f0ff48d22a75 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -722,6 +722,8 @@
title: Hubert
- local: model_doc/mctct
title: MCTCT
+ - local: model_doc/mimi
+ title: Mimi
- local: model_doc/mms
title: MMS
- local: model_doc/musicgen
diff --git a/docs/source/en/index.md b/docs/source/en/index.md
index c18426de4c031c..cc5d7990929aa4 100644
--- a/docs/source/en/index.md
+++ b/docs/source/en/index.md
@@ -210,6 +210,7 @@ Flax), PyTorch, and/or TensorFlow.
| [Megatron-BERT](model_doc/megatron-bert) | ✅ | ❌ | ❌ |
| [Megatron-GPT2](model_doc/megatron_gpt2) | ✅ | ✅ | ✅ |
| [MGP-STR](model_doc/mgp-str) | ✅ | ❌ | ❌ |
+| [Mimi](model_doc/mimi) | ✅ | ❌ | ❌ |
| [Mistral](model_doc/mistral) | ✅ | ✅ | ✅ |
| [Mixtral](model_doc/mixtral) | ✅ | ❌ | ❌ |
| [mLUKE](model_doc/mluke) | ✅ | ❌ | ❌ |
diff --git a/docs/source/en/model_doc/mimi.md b/docs/source/en/model_doc/mimi.md
new file mode 100644
index 00000000000000..486d1836334949
--- /dev/null
+++ b/docs/source/en/model_doc/mimi.md
@@ -0,0 +1,69 @@
+
+
+# Mimi
+
+## Overview
+
+The Mimi model was proposed in [Moshi: a speech-text foundation model for real-time dialogue](https://kyutai.org/Moshi.pdf) by Alexandre Défossez, Laurent Mazaré, Manu Orsini, Amélie Royer, Patrick Pérez, Hervé Jégou, Edouard Grave and Neil Zeghidour. Mimi is a high-fidelity audio codec model developed by the Kyutai team, that combines semantic and acoustic information into audio tokens running at 12Hz and a bitrate of 1.1kbps. In other words, it can be used to map audio waveforms into “audio tokens”, known as “codebooks”.
+
+The abstract from the paper is the following:
+
+*We introduce Moshi, a speech-text foundation model and full-duplex spoken dialogue framework. Current systems for spoken dialogue rely on pipelines of independent components, namely voice activity detection, speech recognition, textual dialogue and text-to-speech. Such frameworks cannot emulate the experience of real conversations. First, their complexity induces a latency of several seconds between interactions. Second, text being the intermediate modality for dialogue, non-linguistic information that modifies meaning— such as emotion or non-speech sounds— is lost in the interaction. Finally, they rely on a segmentation into speaker turns, which does not take into account overlapping speech, interruptions and interjections. Moshi solves these independent issues altogether by casting spoken dialogue as speech-to-speech generation. Starting from a text language model backbone, Moshi generates speech as tokens from the residual quantizer of a neural audio codec, while modeling separately its own speech and that of the user into parallel streams. This allows for the removal of explicit speaker turns, and the modeling of arbitrary conversational dynamics. We moreover extend the hierarchical semantic-to-acoustic token generation of previous work to first predict time-aligned text tokens as a prefix to audio tokens. Not only this “Inner Monologue” method significantly improves the linguistic quality of generated speech, but we also illustrate how it can provide streaming speech recognition and text-to-speech. Our resulting model is the first real-time full-duplex spoken large language model, with a theoretical latency of 160ms, 200ms in practice, and is available at github.com/kyutai-labs/moshi.*
+
+Its architecture is based on [Encodec](model_doc/encodec) with several major differences:
+* it uses a much lower frame-rate.
+* it uses additional transformers for encoding and decoding for better latent contextualization.
+* it uses a different quantization scheme: one codebook is dedicated to semantic projection.
+
+## Usage example
+
+Here is a quick example of how to encode and decode audio using this model:
+
+```python
+>>> from datasets import load_dataset, Audio
+>>> from transformers import MimiModel, AutoFeatureExtractor
+>>> librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+
+>>> # load model and feature extractor
+>>> model = MimiModel.from_pretrained("kyutai/mimi")
+>>> feature_extractor = AutoFeatureExtractor.from_pretrained("kyutai/mimi")
+
+>>> # load audio sample
+>>> librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate))
+>>> audio_sample = librispeech_dummy[-1]["audio"]["array"]
+>>> inputs = feature_extractor(raw_audio=audio_sample, sampling_rate=feature_extractor.sampling_rate, return_tensors="pt")
+
+>>> encoder_outputs = model.encode(inputs["input_values"], inputs["padding_mask"])
+>>> audio_values = model.decode(encoder_outputs.audio_codes, inputs["padding_mask"])[0]
+>>> # or the equivalent with a forward pass
+>>> audio_values = model(inputs["input_values"], inputs["padding_mask"]).audio_values
+```
+
+This model was contributed by [Yoach Lacombe (ylacombe)](https://huggingface.co/ylacombe).
+The original code can be found [here](https://github.com/kyutai-labs/moshi).
+
+
+## MimiConfig
+
+[[autodoc]] MimiConfig
+
+## MimiModel
+
+[[autodoc]] MimiModel
+ - decode
+ - encode
+ - forward
diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md
index dd3433f2cd4862..4c220dd0f1483c 100644
--- a/docs/source/en/perf_infer_gpu_one.md
+++ b/docs/source/en/perf_infer_gpu_one.md
@@ -61,6 +61,7 @@ FlashAttention-2 is currently supported for the following architectures:
* [Llava-NeXT](https://huggingface.co/docs/transformers/model_doc/llava_next)
* [Llava-NeXT-Video](https://huggingface.co/docs/transformers/model_doc/llava_next_video)
* [LLaVA-Onevision](https://huggingface.co/docs/transformers/model_doc/llava_onevision)
+* [Mimi](https://huggingface.co/docs/transformers/model_doc/mimi)
* [VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava)
* [VideoLlava](https://huggingface.co/docs/transformers/model_doc/video_llava)
* [M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)
@@ -228,6 +229,7 @@ For now, Transformers supports SDPA inference and training for the following arc
* [Jamba](https://huggingface.co/docs/transformers/model_doc/jamba#transformers.JambaModel)
* [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel)
* [LLaVA-Onevision](https://huggingface.co/docs/transformers/model_doc/llava_onevision)
+* [Mimi](https://huggingface.co/docs/transformers/model_doc/mimi)
* [Mistral](https://huggingface.co/docs/transformers/model_doc/mistral#transformers.MistralModel)
* [Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral#transformers.MixtralModel)
* [Musicgen](https://huggingface.co/docs/transformers/model_doc/musicgen#transformers.MusicgenModel)
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index bfd0d37916b553..aa13a97fe46150 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -573,6 +573,7 @@
"MgpstrProcessor",
"MgpstrTokenizer",
],
+ "models.mimi": ["MimiConfig"],
"models.mistral": ["MistralConfig"],
"models.mixtral": ["MixtralConfig"],
"models.mluke": [],
@@ -2666,6 +2667,12 @@
"MgpstrPreTrainedModel",
]
)
+ _import_structure["models.mimi"].extend(
+ [
+ "MimiModel",
+ "MimiPreTrainedModel",
+ ]
+ )
_import_structure["models.mistral"].extend(
[
"MistralForCausalLM",
@@ -5345,6 +5352,9 @@
MgpstrProcessor,
MgpstrTokenizer,
)
+ from .models.mimi import (
+ MimiConfig,
+ )
from .models.mistral import MistralConfig
from .models.mixtral import MixtralConfig
from .models.mobilebert import (
@@ -7212,6 +7222,10 @@
MgpstrModel,
MgpstrPreTrainedModel,
)
+ from .models.mimi import (
+ MimiModel,
+ MimiPreTrainedModel,
+ )
from .models.mistral import (
MistralForCausalLM,
MistralForSequenceClassification,
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
index 2022048cd4553f..5b5d1e7902bd67 100644
--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@@ -149,6 +149,7 @@
megatron_bert,
megatron_gpt2,
mgp_str,
+ mimi,
mistral,
mixtral,
mluke,
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index 2cd7d550d90b7a..5a6ec14e78cd43 100644
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -167,6 +167,7 @@
("mega", "MegaConfig"),
("megatron-bert", "MegatronBertConfig"),
("mgp-str", "MgpstrConfig"),
+ ("mimi", "MimiConfig"),
("mistral", "MistralConfig"),
("mixtral", "MixtralConfig"),
("mobilebert", "MobileBertConfig"),
@@ -468,6 +469,7 @@
("megatron-bert", "Megatron-BERT"),
("megatron_gpt2", "Megatron-GPT2"),
("mgp-str", "MGP-STR"),
+ ("mimi", "Mimi"),
("mistral", "Mistral"),
("mixtral", "Mixtral"),
("mluke", "mLUKE"),
diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py
index 7f335d66584f9f..dca0c08aa90957 100644
--- a/src/transformers/models/auto/feature_extraction_auto.py
+++ b/src/transformers/models/auto/feature_extraction_auto.py
@@ -69,6 +69,7 @@
("levit", "LevitFeatureExtractor"),
("maskformer", "MaskFormerFeatureExtractor"),
("mctct", "MCTCTFeatureExtractor"),
+ ("mimi", "EncodecFeatureExtractor"),
("mobilenet_v1", "MobileNetV1FeatureExtractor"),
("mobilenet_v2", "MobileNetV2FeatureExtractor"),
("mobilevit", "MobileViTFeatureExtractor"),
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index e0d15f1e236590..2bc71f07970aee 100644
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -158,6 +158,7 @@
("mega", "MegaModel"),
("megatron-bert", "MegatronBertModel"),
("mgp-str", "MgpstrForSceneTextRecognition"),
+ ("mimi", "MimiModel"),
("mistral", "MistralModel"),
("mixtral", "MixtralModel"),
("mobilebert", "MobileBertModel"),
diff --git a/src/transformers/models/mimi/__init__.py b/src/transformers/models/mimi/__init__.py
new file mode 100644
index 00000000000000..43b2bec6caa5b3
--- /dev/null
+++ b/src/transformers/models/mimi/__init__.py
@@ -0,0 +1,57 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import (
+ OptionalDependencyNotAvailable,
+ _LazyModule,
+ is_torch_available,
+)
+
+
+_import_structure = {
+ "configuration_mimi": ["MimiConfig"],
+}
+
+try:
+ if not is_torch_available():
+ raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+ pass
+else:
+ _import_structure["modeling_mimi"] = [
+ "MimiModel",
+ "MimiPreTrainedModel",
+ ]
+
+if TYPE_CHECKING:
+ from .configuration_mimi import (
+ MimiConfig,
+ )
+
+ try:
+ if not is_torch_available():
+ raise OptionalDependencyNotAvailable()
+ except OptionalDependencyNotAvailable:
+ pass
+ else:
+ from .modeling_mimi import (
+ MimiModel,
+ MimiPreTrainedModel,
+ )
+
+else:
+ import sys
+
+ sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/transformers/models/mimi/configuration_mimi.py b/src/transformers/models/mimi/configuration_mimi.py
new file mode 100644
index 00000000000000..5564b1a54ba63b
--- /dev/null
+++ b/src/transformers/models/mimi/configuration_mimi.py
@@ -0,0 +1,234 @@
+# coding=utf-8
+# Copyright 2024 Meta Platforms, Inc. and affiliates, and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Mimi model configuration"""
+
+import math
+
+import numpy as np
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class MimiConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of an [`MimiModel`]. It is used to instantiate a
+ Mimi model according to the specified arguments, defining the model architecture. Instantiating a configuration
+ with the defaults will yield a similar configuration to that of the
+ [kyutai/mimi](https://huggingface.co/kyutai/mimi) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ sampling_rate (`int`, *optional*, defaults to 24000):
+ The sampling rate at which the audio waveform should be digitalized expressed in hertz (Hz).
+ frame_rate (`float`, *optional*, defaults to 12.5):
+ Framerate of the model.
+ audio_channels (`int`, *optional*, defaults to 1):
+ Number of channels in the audio data. Either 1 for mono or 2 for stereo.
+ hidden_size (`int`, *optional*, defaults to 512):
+ Intermediate representation dimension.
+ num_filters (`int`, *optional*, defaults to 64):
+ Number of convolution kernels of first `MimiConv1d` down sampling layer.
+ num_residual_layers (`int`, *optional*, defaults to 1):
+ Number of residual layers.
+ upsampling_ratios (`Sequence[int]`, *optional*):
+ Kernel size and stride ratios. The encoder uses downsampling ratios instead of upsampling ratios, hence it
+ will use the ratios in the reverse order to the ones specified here that must match the decoder order.
+            If not specified, will default to `[8, 6, 5, 4]`.
+ kernel_size (`int`, *optional*, defaults to 7):
+ Kernel size for the initial convolution.
+ last_kernel_size (`int`, *optional*, defaults to 3):
+ Kernel size for the last convolution layer.
+ residual_kernel_size (`int`, *optional*, defaults to 3):
+ Kernel size for the residual layers.
+ dilation_growth_rate (`int`, *optional*, defaults to 2):
+ How much to increase the dilation with each layer.
+ use_causal_conv (`bool`, *optional*, defaults to `True`):
+ Whether to use fully causal convolution.
+ pad_mode (`str`, *optional*, defaults to `"constant"`):
+ Padding mode for the convolutions.
+ compress (`int`, *optional*, defaults to 2):
+ Reduced dimensionality in residual branches.
+ trim_right_ratio (`float`, *optional*, defaults to 1.0):
+ Ratio for trimming at the right of the transposed convolution under the `use_causal_conv = True` setup. If
+ equal to 1.0, it means that all the trimming is done at the right.
+ codebook_size (`int`, *optional*, defaults to 2048):
+            Number of discrete codes in each codebook.
+ codebook_dim (`int`, *optional*, defaults to 256):
+ Dimension of the unquantized codebook vectors. If not defined, uses `hidden_size`.
+ num_quantizers (`int`, *optional*, defaults to 32):
+ Number of quantizer channels, or codebooks, in the quantizer.
+ use_conv_shortcut (`bool`, *optional*, defaults to `False`):
+ Whether to use a convolutional layer as the 'skip' connection in the `MimiResnetBlock` block. If False,
+ an identity function will be used, giving a generic residual connection.
+ vector_quantization_hidden_dimension (`int`, *optional*, defaults to 256):
+ Intermediate representation dimension in the residual vector quantization space.
+ num_semantic_quantizers (`int`, *optional*, defaults to 1):
+ Number of semantic quantizer channels, or codebooks, in the semantic quantizer. Must be lower than `num_quantizers`.
+ upsample_groups (`int`, *optional*, defaults to 512):
+ If `frame_rate!=encodec_frame_rate`, indicates the number of groups used in the upsampling operation to go from one rate to another.
+ num_hidden_layers (`int`, *optional*, defaults to 8):
+ Number of hidden layers in the Transformer models.
+ intermediate_size (`int`, *optional*, defaults to 2048):
+ Dimension of the MLP representations.
+ num_attention_heads (`int`, *optional*, defaults to 8):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ num_key_value_heads (`int`, *optional*, defaults to 8):
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+ by meanpooling all the original heads within that group. For more details checkout [this
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
+ head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`):
+ The attention head dimension.
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the decoder.
+ max_position_embeddings (`int`, *optional*, defaults to 8000):
+ The maximum sequence length that this model might ever be used with. Mimi's sliding window attention
+ allows sequence of up to 8000 tokens.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ norm_eps (`float`, *optional*, defaults to 1e-05):
+ The epsilon used by the LayerNorm normalization layers.
+ use_cache (`bool`, *optional*, defaults to `False`):
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
+ relevant if `config.is_decoder=True`.
+ rope_theta (`float`, *optional*, defaults to 10000.0):
+ The base period of the RoPE embeddings.
+ sliding_window (`int`, *optional*, defaults to 250):
+ Sliding window attention window size. If not specified, will default to `250`.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ layer_scale_initial_scale (`float`, *optional*, defaults to 0.01):
+            Initial scale of the residual rescaling operation done in the Transformer models.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+ Whether to use a bias in the query, key, value and output projection layers during self-attention.
+ Example:
+
+ ```python
+ >>> from transformers import MimiModel, MimiConfig
+
+ >>> # Initializing a "kyutai/mimi" style configuration
+ >>> configuration = MimiConfig()
+
+ >>> # Initializing a model (with random weights) from the "kyutai/mimi" style configuration
+ >>> model = MimiModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "mimi"
+
+ def __init__(
+ self,
+ sampling_rate=24_000,
+ frame_rate=12.5,
+ audio_channels=1,
+ hidden_size=512,
+ num_filters=64,
+ num_residual_layers=1,
+ upsampling_ratios=None,
+ kernel_size=7,
+ last_kernel_size=3,
+ residual_kernel_size=3,
+ dilation_growth_rate=2,
+ use_causal_conv=True,
+ pad_mode="constant",
+ compress=2,
+ trim_right_ratio=1.0,
+ codebook_size=2048,
+ codebook_dim=256,
+ num_quantizers=32,
+ use_conv_shortcut=False,
+ vector_quantization_hidden_dimension=256,
+ num_semantic_quantizers=1,
+ upsample_groups=512,
+ num_hidden_layers=8,
+ intermediate_size=2048,
+ num_attention_heads=8,
+ num_key_value_heads=8,
+ head_dim=None,
+ hidden_act="gelu",
+ max_position_embeddings=8000,
+ initializer_range=0.02,
+ norm_eps=1e-5,
+ use_cache=False,
+ rope_theta=10000.0,
+ sliding_window=250,
+ attention_dropout=0.0,
+ layer_scale_initial_scale=0.01,
+ attention_bias=False,
+ **kwargs,
+ ):
+ self.sampling_rate = sampling_rate
+ self.frame_rate = frame_rate
+ self.audio_channels = audio_channels
+ self.hidden_size = hidden_size
+ self.num_filters = num_filters
+ self.num_residual_layers = num_residual_layers
+ self.upsampling_ratios = upsampling_ratios if upsampling_ratios else [8, 6, 5, 4]
+ self.kernel_size = kernel_size
+ self.last_kernel_size = last_kernel_size
+ self.residual_kernel_size = residual_kernel_size
+ self.dilation_growth_rate = dilation_growth_rate
+ self.use_causal_conv = use_causal_conv
+ self.pad_mode = pad_mode
+ self.compress = compress
+ self.trim_right_ratio = trim_right_ratio
+ self.codebook_size = codebook_size
+ self.codebook_dim = codebook_dim if codebook_dim is not None else hidden_size
+ self.num_quantizers = num_quantizers
+ self.use_conv_shortcut = use_conv_shortcut
+ self.vector_quantization_hidden_dimension = vector_quantization_hidden_dimension
+ self.upsample_groups = upsample_groups
+ self.num_hidden_layers = num_hidden_layers
+ self.intermediate_size = intermediate_size
+ self.num_attention_heads = num_attention_heads
+ self.num_key_value_heads = num_key_value_heads
+ self.hidden_act = hidden_act
+ self.max_position_embeddings = max_position_embeddings
+ self.initializer_range = initializer_range
+ self.norm_eps = norm_eps
+ self.use_cache = use_cache
+ self.rope_theta = rope_theta
+ self.sliding_window = sliding_window
+ self.attention_dropout = attention_dropout
+ self.head_dim = head_dim or hidden_size // num_attention_heads
+ self.layer_scale_initial_scale = layer_scale_initial_scale
+ self.attention_bias = attention_bias
+
+ if num_semantic_quantizers >= self.num_quantizers:
+ raise ValueError(
+ f"The number of semantic quantizers should be lower than the total number of quantizers {self.num_quantizers}, but is currently {num_semantic_quantizers}."
+ )
+ self.num_semantic_quantizers = num_semantic_quantizers
+ super().__init__(**kwargs)
+
+ @property
+ def encodec_frame_rate(self) -> int:
+ hop_length = np.prod(self.upsampling_ratios)
+ return math.ceil(self.sampling_rate / hop_length)
+
+ @property
+ def num_codebooks(self) -> int:
+ # alias to num_quantizers
+ return self.num_quantizers
diff --git a/src/transformers/models/mimi/convert_mimi_checkpoint_to_pytorch.py b/src/transformers/models/mimi/convert_mimi_checkpoint_to_pytorch.py
new file mode 100644
index 00000000000000..c617fa036c5d47
--- /dev/null
+++ b/src/transformers/models/mimi/convert_mimi_checkpoint_to_pytorch.py
@@ -0,0 +1,198 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert Mimi checkpoints."""
+
+import argparse
+
+import safetensors
+import torch
+
+from transformers import (
+ EncodecFeatureExtractor,
+ MimiConfig,
+ MimiModel,
+ logging,
+)
+
+
+logging.set_verbosity_info()
+logger = logging.get_logger("transformers.models.mimi")
+
+
+def assert_param_count(model_1, model_2):
+ count_1 = sum(p[1].numel() for p in model_1.named_parameters() if "final_proj" not in p[0])
+ count_2 = sum(p[1].numel() for p in model_2.named_parameters() if "final_proj" not in p[0])
+ assert count_1 == count_2, f"{model_1.__class__}: {count_1} != {model_2.__class__}: {count_2}"
+
+
+def param_count(model):
+ return sum(p[1].numel() for p in model.named_parameters() if "final_proj" not in p[0])
+
+
+def _grab_best_device(use_gpu=True):
+    """Return the `cuda` device if `use_gpu` is set and a CUDA device is visible, otherwise the `cpu` device."""
+    cuda_visible = torch.cuda.device_count() > 0
+    if cuda_visible and use_gpu:
+        return torch.device("cuda")
+    return torch.device("cpu")
+
+
+convert_list = [
+ # GENERAL
+ ("conv.conv.conv", "conv"),
+ ("convtr.convtr.convtr", "conv"),
+ ("conv.conv", "conv"),
+ ("convtr.convtr", "conv"),
+ # QUANTIZER
+ ("quantizer.rvq_first.vq", "quantizer.semantic_residual_vector_quantizer"),
+ ("quantizer.rvq_first", "quantizer.semantic_residual_vector_quantizer"),
+ ("quantizer.rvq_rest.vq", "quantizer.acoustic_residual_vector_quantizer"),
+ ("quantizer.rvq_rest", "quantizer.acoustic_residual_vector_quantizer"),
+ ("_codebook", "codebook"),
+ ("_initialized", "initialized"),
+ ("embedding_sum", "embed_sum"),
+ # ENCODER PART
+ ("encoder.model", "encoder.layers"),
+ ("decoder.model", "decoder.layers"),
+ # TRANSFORMERS PART
+ ("encoder_transformer.transformer", "encoder_transformer"),
+ ("decoder_transformer.transformer", "decoder_transformer"),
+ ("linear1", "mlp.fc1"),
+ ("linear2", "mlp.fc2"),
+ ("self_attn.out_proj", "self_attn.o_proj"),
+ ("norm1", "input_layernorm"),
+ ("norm2", "post_attention_layernorm"),
+ ("layer_scale_1", "self_attn_layer_scale"),
+ ("layer_scale_2", "mlp_layer_scale"),
+]
+
+
+def _convert_model(
+ state_dict,
+ hf_model,
+ convert_list,
+ device,
+ config,
+ unwanted_prefix=None,
+):
+ hidden_size = config.hidden_size
+ head_dim = config.head_dim
+ num_heads = int(config.hidden_size // config.head_dim)
+ num_key_value_heads = config.num_key_value_heads
+ key_value_head_dim = config.num_key_value_heads * head_dim
+
+ # permute for sliced rotary
+ def permute(w, n_heads, dim1=hidden_size, dim2=hidden_size):
+ return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2)
+
+ for k, v in list(state_dict.items()):
+ new_k = k if unwanted_prefix is None else k[len(unwanted_prefix) :]
+ for old_layer_name, new_layer_name in convert_list:
+ if old_layer_name in new_k:
+ new_k = new_k.replace(old_layer_name, new_layer_name)
+
+ if "in_proj_weight" in new_k:
+ # split qkv into query key and value
+ mixed_qkv = state_dict.pop(k)
+ qkv_dim = mixed_qkv.size(0) // 3
+
+ query_layer = mixed_qkv[:qkv_dim]
+ key_layer = mixed_qkv[qkv_dim : qkv_dim * 2]
+ value_layer = mixed_qkv[qkv_dim * 2 :]
+
+ state_dict[new_k.replace("in_proj_weight", "q_proj.weight")] = permute(query_layer, num_heads)
+ state_dict[new_k.replace("in_proj_weight", "k_proj.weight")] = permute(
+ key_layer, num_key_value_heads, dim1=key_value_head_dim
+ )
+ state_dict[new_k.replace("in_proj_weight", "v_proj.weight")] = value_layer
+ else:
+ state_dict[new_k] = state_dict.pop(k)
+
+ extra_keys = set(state_dict.keys()) - set(hf_model.state_dict().keys())
+ missing_keys = set(hf_model.state_dict().keys()) - set(state_dict.keys())
+ if len(extra_keys) != 0:
+ raise ValueError(f"extra keys found: {extra_keys}")
+ if len(missing_keys) != 0:
+ raise ValueError(f"missing keys: {missing_keys}")
+ hf_model.load_state_dict(state_dict, strict=True)
+ n_params = param_count(hf_model)
+
+ logger.info(f"model loaded: {round(n_params/1e6,1)}M params")
+
+ hf_model.eval()
+ hf_model.to(device)
+ del state_dict
+
+ return hf_model
+
+
+@torch.no_grad()
+def convert_checkpoint(
+ checkpoint_path,
+ pytorch_dump_folder_path,
+ config_path=None,
+ repo_id=None,
+):
+ """
+ Copy/paste/tweak model's weights to transformers design.
+ """
+ device = _grab_best_device()
+
+ if config_path is not None:
+ config = MimiConfig.from_pretrained(config_path)
+ else:
+ config = MimiConfig()
+
+ model = MimiModel(config)
+
+ feature_extractor = EncodecFeatureExtractor(
+ feature_size=config.audio_channels,
+ sampling_rate=config.sampling_rate,
+ )
+ feature_extractor.save_pretrained(pytorch_dump_folder_path)
+
+ original_checkpoint = safetensors.torch.load_file(checkpoint_path)
+ if "best_state" in original_checkpoint:
+ # we might have a training state saved, in which case discard the yaml results and just retain the weights
+ original_checkpoint = original_checkpoint["best_state"]
+
+ model = _convert_model(original_checkpoint, model, convert_list, device, config)
+
+ model.save_pretrained(pytorch_dump_folder_path)
+
+ if repo_id:
+ print("Pushing to the hub...")
+ feature_extractor.push_to_hub(repo_id)
+ model.push_to_hub(repo_id)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint")
+ parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
+ parser.add_argument(
+ "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
+ )
+ parser.add_argument(
+ "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
+ )
+
+ args = parser.parse_args()
+ convert_checkpoint(
+ args.checkpoint_path,
+ args.pytorch_dump_folder_path,
+ args.config_path,
+ args.push_to_hub,
+ )
diff --git a/src/transformers/models/mimi/modeling_mimi.py b/src/transformers/models/mimi/modeling_mimi.py
new file mode 100644
index 00000000000000..db36250b3d89df
--- /dev/null
+++ b/src/transformers/models/mimi/modeling_mimi.py
@@ -0,0 +1,1722 @@
+# coding=utf-8
+# Copyright 2024 Kyutai, and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Mimi model."""
+
+import math
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache, StaticCache
+from ...modeling_attn_mask_utils import AttentionMaskConverter
+from ...modeling_outputs import BaseModelOutputWithPast
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
+ ModelOutput,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ is_flash_attn_2_available,
+ is_flash_attn_greater_or_equal_2_10,
+ logging,
+ replace_return_docstrings,
+)
+from .configuration_mimi import MimiConfig
+
+
+if is_flash_attn_2_available():
+ from ...modeling_flash_attention_utils import _flash_attention_forward
+
+logger = logging.get_logger(__name__)
+
+
+# General docstring
+_CONFIG_FOR_DOC = "MimiConfig"
+
+
+# Copied from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position
+def _prepare_4d_causal_attention_mask_with_cache_position(
+ attention_mask: torch.Tensor,
+ sequence_length: int,
+ target_length: int,
+ dtype: torch.dtype,
+ device: torch.device,
+ min_dtype: float,
+ cache_position: torch.Tensor,
+ batch_size: int,
+):
+ """
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+ Args:
+ attention_mask (`torch.Tensor`):
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
+ sequence_length (`int`):
+ The sequence length being processed.
+ target_length (`int`):
+ The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
+ dtype (`torch.dtype`):
+ The dtype to use for the 4D attention mask.
+ device (`torch.device`):
+ The device to plcae the 4D attention mask on.
+ min_dtype (`float`):
+ The minimum value representable with the dtype `dtype`.
+ cache_position (`torch.Tensor`):
+ Indices depicting the position of the input sequence tokens in the sequence.
+ batch_size (`torch.Tensor`):
+ Batch size.
+ """
+ if attention_mask is not None and attention_mask.dim() == 4:
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+ causal_mask = attention_mask
+ else:
+ causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
+ if sequence_length != 1:
+ causal_mask = torch.triu(causal_mask, diagonal=1)
+ causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+ if attention_mask is not None:
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
+ mask_length = attention_mask.shape[-1]
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+ padding_mask = padding_mask == 0
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+ padding_mask, min_dtype
+ )
+
+ return causal_mask
+
+
+@dataclass
+class MimiOutput(ModelOutput):
+ """
+ Args:
+ audio_codes (`torch.LongTensor` of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
+            Discrete code embeddings computed using `model.encode`.
+        audio_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Decoded audio values, obtained using the decoder part of Mimi.
+ encoder_past_key_values (`Cache`, *optional*):
+ Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the encoder transformer.
+ This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+ The model will output the same cache format that is fed as input.
+
+            If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes` (those that don't
+ have their past key value states given to this model).
+ decoder_past_key_values (`Cache`, *optional*):
+ Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the decoder transformer.
+ This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+ The model will output the same cache format that is fed as input.
+
+            If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes` (those that don't
+ have their past key value states given to this model).
+ """
+
+ audio_codes: torch.LongTensor = None
+ audio_values: torch.FloatTensor = None
+ encoder_past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None
+ decoder_past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None
+
+
+@dataclass
+class MimiEncoderOutput(ModelOutput):
+ """
+ Args:
+ audio_codes (`torch.LongTensor` of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
+            Discrete code embeddings computed using `model.encode`.
+ encoder_past_key_values (`Cache`, *optional*):
+ Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the encoder transformer.
+ This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+ The model will output the same cache format that is fed as input.
+
+            If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes` (those that don't
+ have their past key value states given to this model).
+ """
+
+ audio_codes: torch.LongTensor = None
+ encoder_past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None
+
+
+@dataclass
+class MimiDecoderOutput(ModelOutput):
+ """
+ Args:
+ audio_values (`torch.FloatTensor` of shape `(batch_size, segment_length)`, *optional*):
+ Decoded audio values, obtained using the decoder part of Mimi.
+ decoder_past_key_values (`Cache`, *optional*):
+ Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the decoder transformer.
+ This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+ The model will output the same cache format that is fed as input.
+
+            If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes` (those that don't
+ have their past key value states given to this model).
+ """
+
+ audio_values: torch.FloatTensor = None
+ decoder_past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None
+
+
+class MimiConv1d(nn.Module):
+ """Conv1d with asymmetric or causal padding and normalization."""
+
+    def __init__(
+        self,
+        config,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        dilation: int = 1,
+        groups: int = 1,
+        pad_mode=None,
+        bias: bool = True,
+    ):
+        super().__init__()
+        self.causal = config.use_causal_conv
+        self.pad_mode = config.pad_mode if pad_mode is None else pad_mode
+
+        # warn user on unusual setup between dilation and stride
+        if stride > 1 and dilation > 1:
+            logger.warning(
+                "MimiConv1d has been initialized with stride > 1 and dilation > 1"
+                f" (kernel_size={kernel_size} stride={stride}, dilation={dilation})."
+            )
+
+        self.conv = nn.Conv1d(
+            in_channels, out_channels, kernel_size, stride, dilation=dilation, groups=groups, bias=bias
+        )
+
+        kernel_size = self.conv.kernel_size[0]
+        stride = torch.tensor(self.conv.stride[0], dtype=torch.int64)
+        dilation = self.conv.dilation[0]
+        # Effective kernel size with dilations.
+        kernel_size = torch.tensor((kernel_size - 1) * dilation + 1, dtype=torch.int64)
+
+        # `kernel_size - stride` is already an int64 tensor: register it as-is instead of re-wrapping it.
+        self.register_buffer("stride", stride, persistent=False)
+        self.register_buffer("kernel_size", kernel_size, persistent=False)
+        self.register_buffer("padding_total", kernel_size - stride, persistent=False)
+
+        # Asymmetric padding required for odd strides
+        self.padding_right = self.padding_total // 2
+        self.padding_left = self.padding_total - self.padding_right
+
+ def apply_weight_norm(self):
+ weight_norm = nn.utils.weight_norm
+ if hasattr(nn.utils.parametrizations, "weight_norm"):
+ weight_norm = nn.utils.parametrizations.weight_norm
+
+ weight_norm(self.conv)
+
+ def remove_weight_norm(self):
+ nn.utils.remove_weight_norm(self.conv)
+
+ # Copied from transformers.models.encodec.modeling_encodec.EncodecConv1d._get_extra_padding_for_conv1d
+ def _get_extra_padding_for_conv1d(
+ self,
+ hidden_states: torch.Tensor,
+ ) -> torch.Tensor:
+ """See `pad_for_conv1d`."""
+ length = hidden_states.shape[-1]
+ n_frames = (length - self.kernel_size + self.padding_total) / self.stride + 1
+ n_frames = torch.ceil(n_frames).to(torch.int64) - 1
+ ideal_length = n_frames * self.stride + self.kernel_size - self.padding_total
+
+ return ideal_length - length
+
+ @staticmethod
+ # Copied from transformers.models.encodec.modeling_encodec.EncodecConv1d._pad1d
+ def _pad1d(hidden_states: torch.Tensor, paddings: Tuple[int, int], mode: str = "zero", value: float = 0.0):
+ """Tiny wrapper around torch.nn.functional.pad, just to allow for reflect padding on small input.
+ If this is the case, we insert extra 0 padding to the right before the reflection happens.
+ """
+ length = hidden_states.shape[-1]
+ padding_left, padding_right = paddings
+ if not mode == "reflect":
+ return nn.functional.pad(hidden_states, paddings, mode, value)
+
+ max_pad = max(padding_left, padding_right)
+ extra_pad = 0
+ if length <= max_pad:
+ extra_pad = max_pad - length + 1
+ hidden_states = nn.functional.pad(hidden_states, (0, extra_pad))
+ padded = nn.functional.pad(hidden_states, paddings, mode, value)
+ end = padded.shape[-1] - extra_pad
+ return padded[..., :end]
+
+ def forward(self, hidden_states):
+ extra_padding = self._get_extra_padding_for_conv1d(hidden_states)
+
+ if self.causal:
+ # Left padding for causal
+ hidden_states = self._pad1d(hidden_states, (self.padding_total, extra_padding), mode=self.pad_mode)
+ else:
+ hidden_states = self._pad1d(
+ hidden_states, (self.padding_left, self.padding_right + extra_padding), mode=self.pad_mode
+ )
+
+ hidden_states = self.conv(hidden_states)
+ return hidden_states
+
+
+class MimiConvTranspose1d(nn.Module):
+    """ConvTranspose1d with asymmetric or causal padding and normalization.
+
+    The transposed convolution yields `kernel_size - stride` surplus samples, which `forward` trims from the output.
+    In causal mode `ceil(surplus * trim_right_ratio)` samples are removed on the right and the remainder on the
+    left; otherwise `surplus // 2` samples are removed on the right and the remainder on the left.
+
+    Args:
+        config: object exposing `use_causal_conv` and `trim_right_ratio`.
+        in_channels (`int`): number of input channels.
+        out_channels (`int`): number of output channels.
+        kernel_size (`int`): kernel size of the transposed convolution.
+        stride (`int`, *optional*, defaults to 1): stride of the transposed convolution.
+        groups (`int`, *optional*, defaults to 1): number of channel groups.
+        bias (`bool`, *optional*, defaults to `True`): whether the convolution has a bias term.
+
+    Raises:
+        ValueError: if `trim_right_ratio != 1.0` while the convolution is not causal.
+    """
+
+    def __init__(
+        self,
+        config,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        groups: int = 1,
+        bias=True,
+    ):
+        super().__init__()
+        self.causal = config.use_causal_conv
+        self.trim_right_ratio = config.trim_right_ratio
+        self.conv = nn.ConvTranspose1d(in_channels, out_channels, kernel_size, stride, groups=groups, bias=bias)
+
+        if not (self.causal or self.trim_right_ratio == 1.0):
+            raise ValueError("`trim_right_ratio` != 1.0 only makes sense for causal convolutions")
+
+        kernel_size = self.conv.kernel_size[0]
+        stride = self.conv.stride[0]
+        padding_total = kernel_size - stride
+
+        # We will only trim fixed padding. Extra padding from `pad_for_conv1d` would be
+        # removed at the very end, when keeping only the right length for the output,
+        # as removing it here would require also passing the length at the matching layer
+        # in the encoder.
+        if self.causal:
+            # Trim the padding on the right according to the specified ratio
+            # if trim_right_ratio = 1.0, trim everything from right
+            self.padding_right = math.ceil(padding_total * self.trim_right_ratio)
+        else:
+            # Asymmetric padding required for odd strides
+            self.padding_right = padding_total // 2
+
+        self.padding_left = padding_total - self.padding_right
+
+    def apply_weight_norm(self):
+        """Apply weight normalization to the wrapped convolution.
+
+        Uses the parametrization-based `nn.utils.parametrizations.weight_norm` when this torch version provides it
+        and the legacy `nn.utils.weight_norm` hook otherwise.
+        """
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
+        weight_norm(self.conv)
+
+    def remove_weight_norm(self):
+        """Undo `apply_weight_norm`, whichever of the two weight-norm mechanisms was used.
+
+        The legacy `nn.utils.remove_weight_norm` only knows about the hook-based implementation and raises a
+        `ValueError` on a module normalized through `nn.utils.parametrizations.weight_norm`, so a registered
+        parametrization is removed through the parametrize API instead.
+        """
+        from torch.nn.utils import parametrize
+
+        if parametrize.is_parametrized(self.conv, "weight"):
+            # Keep the current (normalized) weight values as a plain parameter.
+            parametrize.remove_parametrizations(self.conv, "weight", leave_parametrized=True)
+        else:
+            nn.utils.remove_weight_norm(self.conv)
+
+    def forward(self, hidden_states):
+        """Run the transposed convolution and trim the fixed padding from both ends of the last dimension."""
+        hidden_states = self.conv(hidden_states)
+
+        # unpad
+        end = hidden_states.shape[-1] - self.padding_right
+        hidden_states = hidden_states[..., self.padding_left : end]
+        return hidden_states
+
+
+# Copied from transformers.models.encodec.modeling_encodec.EncodecResnetBlock with Encodec->Mimi,EnCodec->Mimi
+class MimiResnetBlock(nn.Module):
+    """
+    Residual block from SEANet model as used by Mimi.
+
+    The main branch is a stack of (ELU, `MimiConv1d`) pairs, one pair per kernel size; the skip branch is either a
+    1x1 `MimiConv1d` (when `config.use_conv_shortcut` is set) or the identity.
+    """
+
+    def __init__(self, config: MimiConfig, dim: int, dilations: List[int]):
+        super().__init__()
+        kernel_sizes = (config.residual_kernel_size, 1)
+        if len(kernel_sizes) != len(dilations):
+            raise ValueError("Number of kernel sizes should match number of dilations")
+
+        # The block narrows to `dim // compress` channels internally and widens back to `dim` at the last conv.
+        hidden = dim // config.compress
+        last_index = len(kernel_sizes) - 1
+        layers = []
+        for index, (kernel_size, dilation) in enumerate(zip(kernel_sizes, dilations)):
+            in_chs = hidden if index != 0 else dim
+            out_chs = dim if index == last_index else hidden
+            layers.append(nn.ELU())
+            layers.append(MimiConv1d(config, in_chs, out_chs, kernel_size, dilation=dilation))
+        self.block = nn.ModuleList(layers)
+
+        if not config.use_conv_shortcut:
+            self.shortcut = nn.Identity()
+        else:
+            self.shortcut = MimiConv1d(config, dim, dim, kernel_size=1)
+
+    def forward(self, hidden_states):
+        """Return `shortcut(input) + block(input)`."""
+        branch = hidden_states
+        for module in self.block:
+            branch = module(branch)
+
+        return self.shortcut(hidden_states) + branch
+
+
+class MimiEncoder(nn.Module):
+    """SEANet encoder as used by Mimi.
+
+    Layout: an input convolution, then for each ratio of `config.upsampling_ratios` (taken in reverse order)
+    `config.num_residual_layers` residual blocks followed by ELU and a strided convolution that doubles the channel
+    count, and finally ELU plus a projection convolution to `config.hidden_size`.
+    """
+
+    def __init__(self, config: MimiConfig):
+        super().__init__()
+        layers = [MimiConv1d(config, config.audio_channels, config.num_filters, config.kernel_size)]
+        scaling = 1
+
+        # Downsample to raw audio scale
+        for ratio in reversed(config.upsampling_ratios):
+            channels = scaling * config.num_filters
+            # Add residual layers
+            layers.extend(
+                [
+                    MimiResnetBlock(config, channels, [config.dilation_growth_rate**j, 1])
+                    for j in range(config.num_residual_layers)
+                ]
+            )
+            # Add downsampling layers
+            layers.append(nn.ELU())
+            layers.append(MimiConv1d(config, channels, channels * 2, kernel_size=ratio * 2, stride=ratio))
+            scaling *= 2
+
+        layers.append(nn.ELU())
+        layers.append(MimiConv1d(config, scaling * config.num_filters, config.hidden_size, config.last_kernel_size))
+
+        self.layers = nn.ModuleList(layers)
+
+    # Copied from transformers.models.encodec.modeling_encodec.EncodecEncoder.forward
+    def forward(self, hidden_states):
+        """Apply every layer of the encoder in order."""
+        for module in self.layers:
+            hidden_states = module(hidden_states)
+        return hidden_states
+
+
+class MimiLayerScale(nn.Module):
+    """Layer scale from [Touvron et al 2021] (https://arxiv.org/pdf/2103.17239.pdf).
+
+    Multiplies its input by a learnt vector with `config.hidden_size` entries, each initialised to
+    `config.layer_scale_initial_scale`.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        initial_values = torch.full(
+            (config.hidden_size,),
+            config.layer_scale_initial_scale,
+            requires_grad=True,
+        )
+        self.scale = nn.Parameter(initial_values)
+
+    def forward(self, x: torch.Tensor):
+        """Return `x` rescaled by the learnt `scale` vector."""
+        rescaled = self.scale * x
+        return rescaled
+
+
+# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Mimi
+class MimiRotaryEmbedding(nn.Module):
+    """Computes the cosine / sine tables used for rotary position embeddings.
+
+    The inverse frequencies `1 / base ** (2i / dim)` are stored in the non-persistent buffer `inv_freq`.
+    """
+
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+        super().__init__()
+
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+
+        # One exponent per pair of channels: 0, 2/dim, 4/dim, ...
+        exponents = torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim
+        self.register_buffer("inv_freq", 1.0 / (self.base**exponents), persistent=False)
+
+    @torch.no_grad()
+    # copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding.forward
+    # TODO(joao): add me back asap :)
+    def forward(self, x, position_ids):
+        """Return `(cos, sin)` for `position_ids`, cast to the dtype of `x`.
+
+        `x` is only used as a reference for device type and dtype.
+        """
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        batch_size = position_ids.shape[0]
+        inv_freq = self.inv_freq[None, :, None].float().expand(batch_size, -1, 1)
+        positions = position_ids[:, None, :].float()
+
+        # Force float32 since bfloat16 loses precision on long contexts
+        # See https://github.com/huggingface/transformers/pull/29285
+        device_type = x.device.type
+        if not isinstance(device_type, str) or device_type == "mps":
+            device_type = "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):
+            angles = (inv_freq.float() @ positions.float()).transpose(1, 2)
+            emb = torch.cat((angles, angles), dim=-1)
+            cos, sin = emb.cos(), emb.sin()
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+# Copied from transformers.models.llama.modeling_llama.rotate_half
+def rotate_half(x):
+    """Rotates half the hidden dims of the input.
+
+    The last dimension is split at `size // 2`; the result is the negated second part followed by the first part.
+    """
+    half = x.shape[-1] // 2
+    first, second = x[..., :half], x[..., half:]
+    return torch.cat((-second, first), dim=-1)
+
+
+# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Dimension inserted into `cos` and `sin` so that they broadcast against `q` and `k`. With `cos`/`sin` of
+            shape [batch_size, seq_len, head_dim], use `unsqueeze_dim=1` when `q` and `k` are laid out as
+            [batch_size, heads, seq_len, head_dim] and `unsqueeze_dim=2` when they are laid out as
+            [batch_size, seq_len, heads, head_dim].
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos_b = cos.unsqueeze(unsqueeze_dim)
+    sin_b = sin.unsqueeze(unsqueeze_dim)
+
+    def _rotate(states):
+        return (states * cos_b) + (rotate_half(states) * sin_b)
+
+    return _rotate(q), _rotate(k)
+
+
+class MimiMLP(nn.Module):
+    """Two-layer bias-free feed-forward block: `hidden_size -> intermediate_size -> hidden_size` with the activation
+    named by `config.hidden_act` in between."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.activation_fn = ACT2FN[config.hidden_act]
+        model_dim, inner_dim = config.hidden_size, config.intermediate_size
+        self.fc1 = nn.Linear(model_dim, inner_dim, bias=False)
+        self.fc2 = nn.Linear(inner_dim, model_dim, bias=False)
+
+    # Copied from transformers.models.clip.modeling_clip.CLIPMLP.forward
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Apply `fc1`, the activation and `fc2` in sequence."""
+        expanded = self.activation_fn(self.fc1(hidden_states))
+        return self.fc2(expanded)
+
+
+# Copied from transformers.models.llama.modeling_llama.repeat_kv
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Equivalent of `torch.repeat_interleave(x, dim=1, repeats=n_rep)`: every key/value head is repeated `n_rep` times,
+    so the tensor goes from (batch, num_key_value_heads, seqlen, head_dim) to
+    (batch, num_attention_heads, seqlen, head_dim). The input is returned untouched when `n_rep == 1`.
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    # Insert a repeat axis right after the head axis, broadcast along it, then fold it into the head axis.
+    expanded = hidden_states.unsqueeze(2).expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return expanded.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+# Copied from transformers.models.gemma.modeling_gemma.GemmaAttention with Gemma->Mimi
+class MimiAttention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You Need' paper.
+
+ Eager implementation (explicit matmul + softmax). Rotary position embeddings are applied to queries and keys,
+ keys/values can be extended through a `Cache`, and key/value heads are repeated `num_key_value_groups` times so
+ that they line up with the query heads.
+ """
+
+ def __init__(self, config: MimiConfig, layer_idx: Optional[int] = None):
+ super().__init__()
+ self.config = config
+ # Index handed to `past_key_value.update(...)` in `forward`.
+ self.layer_idx = layer_idx
+ if layer_idx is None:
+ logger.warning_once(
+ f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+ "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+ "when creating this class."
+ )
+
+ self.attention_dropout = config.attention_dropout
+ self.hidden_size = config.hidden_size
+ self.num_heads = config.num_attention_heads
+ self.head_dim = config.head_dim
+ self.num_key_value_heads = config.num_key_value_heads
+ # Number of query heads that share one key/value head.
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+ self.max_position_embeddings = config.max_position_embeddings
+ self.rope_theta = config.rope_theta
+ self.is_causal = True
+ # Attention scores are multiplied by 1 / sqrt(head_dim).
+ self.scaling = 1 / math.sqrt(config.head_dim)
+
+ if self.hidden_size % self.num_heads != 0:
+ raise ValueError(
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+ f" and `num_heads`: {self.num_heads})."
+ )
+
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
+ self.rotary_emb = MimiRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.rope_theta,
+ )
+ # Stored on the module; the eager `forward` below does not read it.
+ self.sliding_window = config.sliding_window # Ignore copy
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Cache] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ cache_position: Optional[torch.LongTensor] = None,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ # Returns `(attn_output, attn_weights, past_key_value)`; `attn_weights` is `None` unless `output_attentions`.
+ # `hidden_states` is unpacked as (bsz, q_len, hidden).
+ bsz, q_len, _ = hidden_states.size()
+
+ query_states = self.q_proj(hidden_states)
+ key_states = self.k_proj(hidden_states)
+ value_states = self.v_proj(hidden_states)
+
+ # Split the projections into heads: (bsz, q_len, heads * head_dim) -> (bsz, heads, q_len, head_dim).
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+ # `value_states` only serves as a reference tensor for the rotary module; values themselves are not rotated.
+ cos, sin = self.rotary_emb(value_states, position_ids)
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+ if past_key_value is not None:
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+ # Repeat key/value heads so that each query head has a matching key/value head.
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling
+
+ # The mask is additive and is cropped along its last axis to the current number of keys.
+ if attention_mask is not None: # no matter the length, we just slice it
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+ attn_weights = attn_weights + causal_mask
+
+ # upcast attention to fp32
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+ attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+ attn_output = torch.matmul(attn_weights, value_states)
+
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+ raise ValueError(
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+ f" {attn_output.size()}"
+ )
+
+ # Merge the heads back: (bsz, heads, q_len, head_dim) -> (bsz, q_len, heads * head_dim).
+ attn_output = attn_output.transpose(1, 2).contiguous()
+
+ attn_output = attn_output.view(bsz, q_len, -1)
+ attn_output = self.o_proj(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
+
+
+# Copied from transformers.models.gemma.modeling_gemma.GemmaFlashAttention2 with Gemma->Mimi
+class MimiFlashAttention2(MimiAttention):
+ """
+ Mimi flash attention module. This module inherits from `MimiAttention` as the weights of the module stays
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
+ flash attention and deal with padding tokens in case the input contains any of them.
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+ # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Cache] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ cache_position: Optional[torch.LongTensor] = None,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ # Returns `(attn_output, None, past_key_value)`: attention weights are never returned by this implementation.
+ if isinstance(past_key_value, StaticCache):
+ raise ValueError(
+ "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
+ "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
+ )
+
+ # The caller's `output_attentions` is ignored on purpose; see the final `attn_weights = None`.
+ output_attentions = False
+
+ bsz, q_len, _ = hidden_states.size()
+
+ query_states = self.q_proj(hidden_states)
+ key_states = self.k_proj(hidden_states)
+ value_states = self.v_proj(hidden_states)
+
+ # The projections are first laid out as (batch_size, num_heads, seq_length, head_dim), which is the layout
+ # used for the rotary embedding and the cache update below. They are transposed back to
+ # (batch_size, seq_length, num_heads, head_dim) right before the flash attention call.
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+ cos, sin = self.rotary_emb(value_states, position_ids)
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+ if past_key_value is not None:
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+ # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
+ # to be able to avoid many of these transpose/reshape/view.
+ query_states = query_states.transpose(1, 2)
+ key_states = key_states.transpose(1, 2)
+ value_states = value_states.transpose(1, 2)
+
+ # Dropout is only active in training mode.
+ dropout_rate = self.attention_dropout if self.training else 0.0
+
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+ # therefore the input hidden states gets silently casted in float32. Hence, we need
+ # cast them back in the correct dtype just to be sure everything works as expected.
+ # This might slowdown training & inference so it is recommended to not cast the LayerNorms
+ # in fp32. (MimiRMSNorm handles it correctly)
+
+ # Target dtype priority: autocast dtype, then the pre-quantization dtype recorded on the config, then the
+ # dtype of the query projection weights.
+ input_dtype = query_states.dtype
+ if input_dtype == torch.float32:
+ if torch.is_autocast_enabled():
+ target_dtype = torch.get_autocast_gpu_dtype()
+ # Handle the case where the model is quantized
+ elif hasattr(self.config, "_pre_quantization_dtype"):
+ target_dtype = self.config._pre_quantization_dtype
+ else:
+ target_dtype = self.q_proj.weight.dtype
+
+ logger.warning_once(
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+ f" {target_dtype}."
+ )
+
+ query_states = query_states.to(target_dtype)
+ key_states = key_states.to(target_dtype)
+ value_states = value_states.to(target_dtype)
+
+ attn_output = _flash_attention_forward(
+ query_states,
+ key_states,
+ value_states,
+ attention_mask,
+ q_len,
+ position_ids=position_ids,
+ dropout=dropout_rate,
+ sliding_window=getattr(self, "sliding_window", None),
+ is_causal=self.is_causal,
+ use_top_left_mask=self._flash_attn_uses_top_left_mask,
+ )
+
+ # Merge the heads back into the last dimension before the output projection.
+ attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
+ attn_output = self.o_proj(attn_output)
+
+ # Always taken, since `output_attentions` was forced to False above.
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
+
+
+# Copied from transformers.models.gemma.modeling_gemma.GemmaSdpaAttention with Gemma->Mimi
+class MimiSdpaAttention(MimiAttention):
+ """
+ Mimi attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+ `MimiAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
+ SDPA API.
+ """
+
+ # Adapted from MimiAttention.forward
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Cache] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ cache_position: Optional[torch.LongTensor] = None,
+ **kwargs,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ # Returns `(attn_output, None, past_key_value)` on the SDPA path. When attention weights are requested the
+ # call is delegated to the eager parent implementation (extra `**kwargs` are not forwarded).
+ if output_attentions:
+ # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
+ logger.warning_once(
+ "MimiModel is using MimiSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+ 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+ )
+ return super().forward(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ cache_position=cache_position,
+ )
+
+ bsz, q_len, _ = hidden_states.size()
+
+ query_states = self.q_proj(hidden_states)
+ key_states = self.k_proj(hidden_states)
+ value_states = self.v_proj(hidden_states)
+
+ # Split the projections into heads: (bsz, q_len, heads * head_dim) -> (bsz, heads, q_len, head_dim).
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+ cos, sin = self.rotary_emb(value_states, position_ids)
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+ if past_key_value is not None:
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+ # Repeat key/value heads so that each query head has a matching key/value head.
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+ # Crop the mask along its last axis to the current number of keys.
+ causal_mask = attention_mask
+ if attention_mask is not None:
+ causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
+
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
+ if query_states.device.type == "cuda" and causal_mask is not None:
+ query_states = query_states.contiguous()
+ key_states = key_states.contiguous()
+ value_states = value_states.contiguous()
+
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+ # `is_causal` is only set when no explicit mask is given and more than one query position is processed.
+ is_causal = True if causal_mask is None and q_len > 1 else False
+
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
+ query_states,
+ key_states,
+ value_states,
+ attn_mask=causal_mask,
+ dropout_p=self.attention_dropout if self.training else 0.0,
+ is_causal=is_causal,
+ )
+
+ # Merge the heads back: (bsz, heads, q_len, head_dim) -> (bsz, q_len, heads * head_dim).
+ attn_output = attn_output.transpose(1, 2).contiguous()
+ attn_output = attn_output.view(bsz, q_len, -1)
+
+ attn_output = self.o_proj(attn_output)
+
+ return attn_output, None, past_key_value
+
+
+# Maps the value of `config._attn_implementation` to the attention class instantiated by `MimiTransformerLayer`.
+MIMI_ATTENTION_CLASSES = dict(
+    eager=MimiAttention,
+    flash_attention_2=MimiFlashAttention2,
+    sdpa=MimiSdpaAttention,
+)
+
+
+class MimiTransformerLayer(nn.Module):
+    """Pre-norm transformer block: self-attention followed by an MLP.
+
+    Each sub-block is applied to a layer-normalized input, rescaled by its own `MimiLayerScale` and added back to
+    the residual stream. The attention class is picked from `MIMI_ATTENTION_CLASSES` using
+    `config._attn_implementation`.
+    """
+
+    def __init__(self, config: MimiConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        attention_cls = MIMI_ATTENTION_CLASSES[config._attn_implementation]
+        self.self_attn = attention_cls(config=config, layer_idx=layer_idx)
+
+        self.mlp = MimiMLP(config)
+        self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps)
+        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps)
+        self.self_attn_layer_scale = MimiLayerScale(config)
+        self.mlp_layer_scale = MimiLayerScale(config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*):
+                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
+                query_sequence_length, key_sequence_length)` if default attention is used.
+            position_ids (`torch.LongTensor`, *optional*):
+                Position indices forwarded to the attention module.
+            past_key_value (`Cache`, *optional*): cached past key and value projection states
+            output_attentions (`bool`, *optional*):
+                Whether or not to append the attention weights to the returned tuple.
+            use_cache (`bool`, *optional*):
+                Whether or not to append the key/value cache returned by the attention module to the returned tuple.
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence
+            kwargs (`dict`, *optional*):
+                Extra keyword arguments, passed through to the attention module.
+
+        Returns:
+            A tuple `(hidden_states,)`, extended with the attention weights when `output_attentions` is set and then
+            with the key/value cache when `use_cache` is set.
+        """
+        # Self Attention
+        attn_output, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=self.input_layernorm(hidden_states),
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        hidden_states = hidden_states + self.self_attn_layer_scale(attn_output)
+
+        # Fully Connected
+        mlp_output = self.mlp(self.post_attention_layernorm(hidden_states))
+        hidden_states = hidden_states + self.mlp_layer_scale(mlp_output)
+
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs = (*outputs, self_attn_weights)
+        if use_cache:
+            outputs = (*outputs, present_key_value)
+        return outputs
+
+
+class MimiTransformerModel(nn.Module):
+ """
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MimiTransformerLayer`]
+
+ Args:
+ config: MimiConfig
+ """
+
+ def __init__(self, config: MimiConfig):
+ super().__init__()
+
+ self.layers = nn.ModuleList(
+ [MimiTransformerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+ )
+ self._attn_implementation = config._attn_implementation
+
+ # When True (and in training mode), `forward` runs each layer through `self._gradient_checkpointing_func`,
+ # which is not defined in this class.
+ self.gradient_checkpointing = False
+ self.config = config
+
+ def forward(
+ self,
+ hidden_states: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ cache_position: Optional[torch.LongTensor] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Embedded representation that will be contextualized by the model
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+
+ When `None`, no mask is built and the layers are called with `attention_mask=None`.
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens, used for the rotary position embeddings. When not
+ provided, `cache_position` (with a leading batch dimension of size 1) is used.
+
+ [What are position IDs?](../glossary#position-ids)
+ past_key_values (`Cache`, *optional*):
+ Pre-computed key and value states of the self-attention blocks that can be used to speed up sequential
+ decoding. This typically consists in the `past_key_values` returned by the model at a previous stage of
+ decoding, when `use_cache=True` or `config.use_cache=True`.
+
+ If `use_cache` is enabled, no cache is passed and the module is not in training mode, a new
+ `DynamicCache` is created. The cache object handed back by the layers is returned as-is; it is not
+ converted to the legacy tuple format.
+
+ NOTE(review): the type hint also admits the legacy tuple format, but this method calls
+ `get_seq_length()` on the object without converting it - confirm whether legacy tuples are still
+ meant to be supported.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`). Forced to `False` when gradient checkpointing is active during training.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ cache_position (`torch.LongTensor`, *optional*):
+ Positions of the current inputs in the full sequence. When not provided it is computed as
+ `arange(past_seen_tokens, past_seen_tokens + sequence_length)`, where `past_seen_tokens` is the length
+ reported by `past_key_values` (0 without a cache).
+
+ Returns:
+ A [`BaseModelOutputWithPast`] when `return_dict` is set, otherwise a tuple of the non-`None` values among
+ `(hidden_states, cache, all_hidden_states, all_attentions)`.
+ """
+ # Every unset flag falls back to the corresponding config value.
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if self.gradient_checkpointing and self.training and use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+ )
+ use_cache = False
+
+ # `past_key_values` is None on this branch, so this builds a fresh cache rather than converting one.
+ if use_cache and past_key_values is None and not self.training:
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+
+ if cache_position is None:
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+ cache_position = torch.arange(
+ past_seen_tokens, past_seen_tokens + hidden_states.shape[1], device=hidden_states.device
+ )
+
+ if position_ids is None:
+ position_ids = cache_position.unsqueeze(0)
+
+ # Without an attention mask the layers receive `None` and each attention implementation handles causality.
+ causal_mask = None
+ if attention_mask is not None:
+ causal_mask = self._update_causal_mask(
+ attention_mask, hidden_states, cache_position, past_key_values, output_attentions
+ )
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ next_decoder_cache = None
+
+ for decoder_layer in self.layers:
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ if self.gradient_checkpointing and self.training:
+ layer_outputs = self._gradient_checkpointing_func(
+ decoder_layer.__call__,
+ hidden_states,
+ causal_mask,
+ position_ids,
+ past_key_values,
+ output_attentions,
+ use_cache,
+ cache_position,
+ )
+ else:
+ layer_outputs = decoder_layer(
+ hidden_states,
+ attention_mask=causal_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_values,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ cache_position=cache_position,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ # The cache sits after the attention weights in the layer output when both are requested.
+ if use_cache:
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[1],)
+
+ # add hidden states from the last decoder layer
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ next_cache = next_decoder_cache if use_cache else None
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+
+ return BaseModelOutputWithPast(
+ last_hidden_state=hidden_states,
+ past_key_values=next_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
+ )
+
+ # Copied from transformers.models.gemma.modeling_gemma.GemmaModel._update_causal_mask
+ def _update_causal_mask(
+ self,
+ attention_mask: torch.Tensor,
+ input_tensor: torch.Tensor,
+ cache_position: torch.Tensor,
+ past_key_values: Cache,
+ output_attentions: bool,
+ ):
+ # Builds the mask that `forward` hands to the attention layers:
+ # - flash_attention_2: the given mask when it contains a 0.0 entry, otherwise None;
+ # - sdpa (no static cache, no attention outputs): None when `_ignore_causal_mask_sdpa` allows it;
+ # - otherwise: the result of `_prepare_4d_causal_attention_mask_with_cache_position`.
+ if self.config._attn_implementation == "flash_attention_2":
+ if attention_mask is not None and 0.0 in attention_mask:
+ return attention_mask
+ return None
+
+ # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+ # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+ # to infer the attention mask.
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+ using_static_cache = isinstance(past_key_values, StaticCache)
+
+ # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+ if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
+ if AttentionMaskConverter._ignore_causal_mask_sdpa(
+ attention_mask,
+ inputs_embeds=input_tensor,
+ past_key_values_length=past_seen_tokens,
+ is_training=self.training,
+ ):
+ return None
+
+ dtype, device = input_tensor.dtype, input_tensor.device
+ # Most negative finite value of the dtype, passed on as `min_dtype` to the mask helpers below.
+ min_dtype = torch.finfo(dtype).min
+ sequence_length = input_tensor.shape[1]
+ # Key length of the mask: the static cache capacity, the mask width, or past + current + 1.
+ if using_static_cache:
+ target_length = past_key_values.get_max_length()
+ else:
+ target_length = (
+ attention_mask.shape[-1]
+ if isinstance(attention_mask, torch.Tensor)
+ else past_seen_tokens + sequence_length + 1
+ )
+
+ # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+ causal_mask = _prepare_4d_causal_attention_mask_with_cache_position(
+ attention_mask,
+ sequence_length=sequence_length,
+ target_length=target_length,
+ dtype=dtype,
+ device=device,
+ min_dtype=min_dtype,
+ cache_position=cache_position,
+ batch_size=input_tensor.shape[0],
+ )
+ if (
+ self.config._attn_implementation == "sdpa"
+ and attention_mask is not None
+ and attention_mask.device.type == "cuda"
+ and not output_attentions
+ ):
+ # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+ # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+ # Details: https://github.com/pytorch/pytorch/issues/110213
+ causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+ return causal_mask
+
+
+class MimiDecoder(nn.Module):
+ """SEANet decoder as used by Mimi."""
+
+ def __init__(self, config: MimiConfig):
+ super().__init__()
+ scaling = int(2 ** len(config.upsampling_ratios))
+ model = [MimiConv1d(config, config.hidden_size, scaling * config.num_filters, config.kernel_size)]
+
+ # Upsample to raw audio scale
+ for ratio in config.upsampling_ratios:
+ current_scale = scaling * config.num_filters
+ # Add upsampling layers
+ model += [nn.ELU()]
+ model += [
+ MimiConvTranspose1d(config, current_scale, current_scale // 2, kernel_size=ratio * 2, stride=ratio)
+ ]
+ # Add residual layers
+ for j in range(config.num_residual_layers):
+ model += [MimiResnetBlock(config, current_scale // 2, (config.dilation_growth_rate**j, 1))]
+ scaling //= 2
+
+ # Add final layers
+ model += [nn.ELU()]
+ model += [MimiConv1d(config, config.num_filters, config.audio_channels, config.last_kernel_size)]
+ self.layers = nn.ModuleList(model)
+
+ # Copied from transformers.models.encodec.modeling_encodec.EncodecDecoder.forward
+ def forward(self, hidden_states):
+ for layer in self.layers:
+ hidden_states = layer(hidden_states)
+ return hidden_states
+
+
+class MimiEuclideanCodebook(nn.Module):
+ """Codebook with Euclidean distance."""
+
+ def __init__(self, config: MimiConfig, epsilon: float = 1e-5):
+ super().__init__()
+ embed = torch.zeros(config.codebook_size, config.codebook_dim)
+
+ self.codebook_size = config.codebook_size
+
+ self.register_buffer("initialized", torch.Tensor([True]))
+ self.register_buffer("cluster_usage", torch.ones(config.codebook_size))
+ self.register_buffer("embed_sum", embed)
+ self._embed = None
+ self.epsilon = epsilon
+
+ @property
+ def embed(self) -> torch.Tensor:
+ if self._embed is None:
+ self._embed = self.embed_sum / self.cluster_usage.clamp(min=self.epsilon)[:, None]
+ return self._embed
+
+ def quantize(self, hidden_states):
+ # Projects each vector in `hidden_states` onto the nearest centroid and returns its index.
+ # `hidden_states` should be `[N, D]` with `N` the number of input vectors and `D` the dimension.
+ dists = torch.cdist(hidden_states[None], self.embed[None], p=2)[0]
+ embed_ind = dists.argmin(dim=-1)
+ return embed_ind
+
+ # Copied from transformers.models.encodec.modeling_encodec.EncodecEuclideanCodebook.encode
+ def encode(self, hidden_states):
+ shape = hidden_states.shape
+ # pre-process
+ hidden_states = hidden_states.reshape((-1, shape[-1]))
+ # quantize
+ embed_ind = self.quantize(hidden_states)
+ # post-process
+ embed_ind = embed_ind.view(*shape[:-1])
+ return embed_ind
+
+ # Copied from transformers.models.encodec.modeling_encodec.EncodecEuclideanCodebook.decode
+ def decode(self, embed_ind):
+ quantize = nn.functional.embedding(embed_ind, self.embed)
+ return quantize
+
+
+# Copied from transformers.models.encodec.modeling_encodec.EncodecVectorQuantization with Encodec->Mimi
+class MimiVectorQuantization(nn.Module):
+ """
+ Vector quantization implementation. Currently supports only euclidean distance.
+ """
+
+ def __init__(self, config: MimiConfig):
+ super().__init__()
+ self.codebook = MimiEuclideanCodebook(config)
+
+ def encode(self, hidden_states):
+ hidden_states = hidden_states.permute(0, 2, 1)
+ embed_in = self.codebook.encode(hidden_states)
+ return embed_in
+
+ def decode(self, embed_ind):
+ quantize = self.codebook.decode(embed_ind)
+ quantize = quantize.permute(0, 2, 1)
+ return quantize
+
+
+class MimiResidualVectorQuantizer(nn.Module):
+ """Residual Vector Quantizer."""
+
+ def __init__(self, config: MimiConfig, num_quantizers: int = None):
+ super().__init__()
+ self.codebook_size = config.codebook_size
+ self.frame_rate = config.frame_rate
+ self.num_quantizers = num_quantizers if num_quantizers is not None else config.num_quantizers
+ self.layers = nn.ModuleList([MimiVectorQuantization(config) for _ in range(self.num_quantizers)])
+
+ self.input_proj = None
+ self.output_proj = None
+ if config.vector_quantization_hidden_dimension != config.hidden_size:
+ self.input_proj = torch.nn.Conv1d(
+ config.hidden_size, config.vector_quantization_hidden_dimension, 1, bias=False
+ )
+ self.output_proj = torch.nn.Conv1d(
+ config.vector_quantization_hidden_dimension, config.hidden_size, 1, bias=False
+ )
+
+ def encode(self, embeddings: torch.Tensor, num_quantizers: Optional[int] = None) -> torch.Tensor:
+ """
+ Encode a given input tensor with the specified frame rate at the given number of quantizers / codebooks. The RVQ encode method sets
+ the appropriate number of quantizers to use and returns indices for each quantizer.
+ """
+ if self.input_proj is not None:
+ embeddings = self.input_proj(embeddings)
+
+ num_quantizers = num_quantizers if num_quantizers is not None else self.num_quantizers
+
+ residual = embeddings
+ all_indices = []
+ for layer in self.layers[:num_quantizers]:
+ indices = layer.encode(residual)
+ quantized = layer.decode(indices)
+ residual = residual - quantized
+ all_indices.append(indices)
+ out_indices = torch.stack(all_indices)
+ return out_indices
+
+ def decode(self, codes: torch.Tensor) -> torch.Tensor:
+ """Decode the given codes of shape [B, K, T] to the quantized representation."""
+ quantized_out = torch.tensor(0.0, device=codes.device)
+ codes = codes.transpose(0, 1)
+ for i, indices in enumerate(codes):
+ layer = self.layers[i]
+ quantized = layer.decode(indices)
+ quantized_out = quantized_out + quantized
+
+ if self.output_proj is not None:
+ quantized_out = self.output_proj(quantized_out)
+ return quantized_out
+
+
+class MimiSplitResidualVectorQuantizer(nn.Module):
+ """Split Residual Vector Quantizer."""
+
+ def __init__(self, config: MimiConfig):
+ super().__init__()
+ self.codebook_size = config.codebook_size
+ self.frame_rate = config.frame_rate
+ self.max_num_quantizers = config.num_quantizers
+
+ self.num_semantic_quantizers = config.num_semantic_quantizers
+ self.num_acoustic_quantizers = config.num_quantizers - config.num_semantic_quantizers
+
+ self.semantic_residual_vector_quantizer = MimiResidualVectorQuantizer(config, self.num_semantic_quantizers)
+ self.acoustic_residual_vector_quantizer = MimiResidualVectorQuantizer(config, self.num_acoustic_quantizers)
+
+ def encode(self, embeddings: torch.Tensor, num_quantizers: Optional[float] = None) -> torch.Tensor:
+ """
+ Encode a given input tensor with the specified frame rate at the given number of quantizers / codebooks. The RVQ encode method sets
+ the appropriate number of quantizers to use and returns indices for each quantizer.
+ """
+
+ num_quantizers = self.max_num_quantizers if num_quantizers is None else num_quantizers
+
+ if num_quantizers > self.max_num_quantizers:
+ raise ValueError(
+ f"The number of quantizers (i.e codebooks) asked should be lower than the total number of quantizers {self.max_num_quantizers}, but is currently {num_quantizers}."
+ )
+
+ if num_quantizers < self.num_semantic_quantizers:
+ raise ValueError(
+ f"The number of quantizers (i.e codebooks) asked should be higher than the number of semantic quantizers {self.num_semantic_quantizers}, but is currently {num_quantizers}."
+ )
+
+ # codes is [K, B, T], with T frames, K nb of codebooks.
+ codes = self.semantic_residual_vector_quantizer.encode(embeddings)
+
+ if num_quantizers > self.num_semantic_quantizers:
+ acoustic_codes = self.acoustic_residual_vector_quantizer.encode(
+ embeddings, num_quantizers=num_quantizers - self.num_semantic_quantizers
+ )
+ codes = torch.cat([codes, acoustic_codes], dim=0)
+
+ return codes
+
+ def decode(self, codes: torch.Tensor) -> torch.Tensor:
+ """Decode the given codes to the quantized representation."""
+
+ # The first num_semantic_quantizers codebooks are decoded using the semantic RVQ
+ quantized_out = self.semantic_residual_vector_quantizer.decode(codes[:, : self.num_semantic_quantizers])
+
+ # The rest of the codebooks are decoded using the acoustic RVQ
+ if codes.shape[1] > self.num_semantic_quantizers:
+ quantized_out += self.acoustic_residual_vector_quantizer.decode(codes[:, self.num_semantic_quantizers :])
+ return quantized_out
+
+
+class MimiPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = MimiConfig
+ base_model_prefix = "mimi"
+ main_input_name = "input_values"
+ supports_gradient_checkpointing = True
+ _no_split_modules = ["MimiDecoderLayer"]
+ _skip_keys_device_placement = "past_key_values"
+ _supports_flash_attn_2 = True
+ _supports_sdpa = True
+ _supports_cache_class = True
+ _supports_static_cache = True
+
+ # Copied from transformers.models.encodec.modeling_encodec.EncodecPreTrainedModel._init_weights
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ if isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ elif isinstance(module, nn.Conv1d):
+ nn.init.kaiming_normal_(module.weight)
+ if module.bias is not None:
+ k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
+ nn.init.uniform_(module.bias, a=-k, b=k)
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+ elif isinstance(module, nn.LSTM):
+ for name, param in module.named_parameters():
+ if "weight" in name:
+ nn.init.xavier_uniform_(param)
+ elif "bias" in name:
+ nn.init.constant_(param, 0.0)
+
+
+MIMI_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`MimiConfig`]):
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
+ load the weights associated with the model, only the configuration. Check out the
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+MIMI_INPUTS_DOCSTRING = r"""
+ Args:
+ input_values (`torch.FloatTensor` of shape `(batch_size, channels, sequence_length)`, *optional*):
+ Raw audio input converted to Float.
+ padding_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* or 0
+ for *masked*.
+ num_quantizers (`int`, *optional*):
+ Number of quantizers (i.e codebooks) to use. By default, all quantizers are used.
+ audio_codes (`torch.LongTensor` of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
+ Discrete code embeddings computed using `model.encode`.
+ encoder_past_key_values (`Cache`, *optional*):
+ Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the encoder transformer.
+ This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+ The model will output the same cache format that is fed as input.
+
+ If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes` (those that don't
+ have their past key value states given to this model).
+ decoder_past_key_values (`Cache`, *optional*):
+ Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the decoder transformer.
+ This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+ The model will output the same cache format that is fed as input.
+
+ If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes` (those that don't
+ have their past key value states given to this model).
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The Mimi neural audio codec model.",
+ MIMI_START_DOCSTRING,
+)
+class MimiModel(MimiPreTrainedModel):
+ def __init__(self, config: MimiConfig):
+ super().__init__(config)
+ self.config = config
+
+ self.encoder = MimiEncoder(config)
+ self.encoder_transformer = MimiTransformerModel(config)
+
+ self.downsample = None
+ self.upsample = None
+ if config.frame_rate != config.encodec_frame_rate:
+ self.downsample = MimiConv1d(
+ config,
+ config.hidden_size,
+ config.hidden_size,
+ kernel_size=2 * int(config.encodec_frame_rate / config.frame_rate),
+ stride=2,
+ bias=False,
+ pad_mode="replicate",
+ )
+
+ self.upsample = MimiConvTranspose1d(
+ config,
+ config.hidden_size,
+ config.hidden_size,
+ kernel_size=2 * int(config.encodec_frame_rate / config.frame_rate),
+ stride=2,
+ bias=False,
+ groups=config.upsample_groups,
+ )
+
+ self.decoder_transformer = MimiTransformerModel(config)
+ self.decoder = MimiDecoder(config)
+
+ self.quantizer = MimiSplitResidualVectorQuantizer(config)
+
+ self.bits_per_codebook = int(math.log2(self.config.codebook_size))
+ if 2**self.bits_per_codebook != self.config.codebook_size:
+ raise ValueError("The codebook_size must be a power of 2.")
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_encoder(self):
+ return self.encoder
+
+ def get_decoder(self):
+ return self.decoder
+
+ def _encode_frame(
+ self,
+ input_values: torch.Tensor,
+ num_quantizers: int,
+ padding_mask: int,
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+ """
+ Encodes the given input using the underlying VQVAE. Note: `padding_mask` is accepted but not used in this method.
+ """
+ embeddings = self.encoder(input_values)
+ encoder_outputs = self.encoder_transformer(
+ embeddings.transpose(1, 2), past_key_values=past_key_values, return_dict=return_dict
+ )
+ if return_dict:
+ past_key_values = encoder_outputs.get("past_key_values")
+ elif len(encoder_outputs) > 1:
+ past_key_values = encoder_outputs[1]
+ embeddings = encoder_outputs[0].transpose(1, 2)
+ embeddings = self.downsample(embeddings)
+
+ codes = self.quantizer.encode(embeddings, num_quantizers)
+ codes = codes.transpose(0, 1)
+ return codes, past_key_values
+
+ def encode(
+ self,
+ input_values: torch.Tensor,
+ padding_mask: torch.Tensor = None,
+ num_quantizers: Optional[float] = None,
+ encoder_past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple[torch.Tensor, Optional[torch.Tensor]], MimiEncoderOutput]:
+ """
+ Encodes the input audio waveform into discrete codes.
+
+ Args:
+ input_values (`torch.Tensor` of shape `(batch_size, channels, sequence_length)`):
+ Float values of the input audio waveform.
+ padding_mask (`torch.Tensor` of shape `(batch_size, channels, sequence_length)`):
+ Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* or 0
+ for *masked*.
+ num_quantizers (`int`, *optional*):
+ Number of quantizers (i.e codebooks) to use. By default, all quantizers are used.
+ encoder_past_key_values (`Cache`, *optional*):
+ Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the encoder transformer.
+ This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+ The model will output the same cache format that is fed as input.
+
+ If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes` (those that don't
+ have their past key value states given to this model).
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+
+ Returns:
+ `codebook` of shape `[batch_size, num_codebooks, frames]`, the discrete encoded codes for the input audio waveform.
+ """
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+ num_quantizers = self.config.num_quantizers if num_quantizers is None else num_quantizers
+
+ if num_quantizers > self.config.num_quantizers:
+ raise ValueError(
+ f"The number of quantizers (i.e codebooks) asked should be lower than the total number of quantizers {self.config.num_quantizers}, but is currently {num_quantizers}."
+ )
+
+ _, channels, input_length = input_values.shape
+
+ if channels < 1 or channels > 2:
+ raise ValueError(f"Number of audio channels must be 1 or 2, but got {channels}")
+
+ if padding_mask is None:
+ padding_mask = torch.ones_like(input_values).bool()
+
+ encoded_frames, encoder_past_key_values = self._encode_frame(
+ input_values,
+ num_quantizers,
+ padding_mask.bool(),
+ past_key_values=encoder_past_key_values,
+ return_dict=return_dict,
+ )
+
+ if not return_dict:
+ return (
+ encoded_frames,
+ encoder_past_key_values,
+ )
+
+ return MimiEncoderOutput(encoded_frames, encoder_past_key_values)
+
+ def _decode_frame(
+ self,
+ codes: torch.Tensor,
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+ return_dict: Optional[bool] = None,
+ ) -> torch.Tensor:
+ embeddings = self.quantizer.decode(codes)
+
+ embeddings = self.upsample(embeddings)
+ decoder_outputs = self.decoder_transformer(
+ embeddings.transpose(1, 2), past_key_values=past_key_values, return_dict=return_dict
+ )
+ if return_dict:
+ past_key_values = decoder_outputs.get("past_key_values")
+ elif len(decoder_outputs) > 1:
+ past_key_values = decoder_outputs[1]
+ embeddings = decoder_outputs[0].transpose(1, 2)
+ outputs = self.decoder(embeddings)
+ return outputs, past_key_values
+
+ def decode(
+ self,
+ audio_codes: torch.Tensor,
+ padding_mask: Optional[torch.Tensor] = None,
+ decoder_past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], MimiDecoderOutput]:
+ """
+ Decodes the given frames into an output audio waveform.
+
+ Note that the output might be a bit bigger than the input. In that case, any extra steps at the end can be
+ trimmed.
+
+ Args:
+ audio_codes (`torch.LongTensor` of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
+ Discrete code embeddings computed using `model.encode`.
+ padding_mask (`torch.Tensor` of shape `(batch_size, channels, sequence_length)`):
+ Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* or 0
+ for *masked*.
+ decoder_past_key_values (`Cache`, *optional*):
+ Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the decoder transformer.
+ This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+ The model will output the same cache format that is fed as input.
+
+ If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes` (those that don't
+ have their past key value states given to this model).
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+
+ """
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+ audio_values, decoder_past_key_values = self._decode_frame(
+ audio_codes, past_key_values=decoder_past_key_values, return_dict=return_dict
+ )
+
+ # truncate based on padding mask
+ if padding_mask is not None and padding_mask.shape[-1] < audio_values.shape[-1]:
+ audio_values = audio_values[..., : padding_mask.shape[-1]]
+
+ if not return_dict:
+ return (
+ audio_values,
+ decoder_past_key_values,
+ )
+ return MimiDecoderOutput(audio_values, decoder_past_key_values)
+
+ @add_start_docstrings_to_model_forward(MIMI_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=MimiOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_values: torch.Tensor,
+ padding_mask: Optional[torch.Tensor] = None,
+ num_quantizers: Optional[int] = None,
+ audio_codes: Optional[torch.Tensor] = None,
+ encoder_past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+ decoder_past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], MimiOutput]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from datasets import load_dataset
+ >>> from transformers import AutoFeatureExtractor, MimiModel
+
+ >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
+ >>> audio_sample = dataset["train"]["audio"][0]["array"]
+
+ >>> model_id = "kyutai/mimi"
+ >>> model = MimiModel.from_pretrained(model_id)
+ >>> feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
+
+ >>> inputs = feature_extractor(raw_audio=audio_sample, return_tensors="pt")
+
+ >>> outputs = model(**inputs)
+ >>> audio_codes = outputs.audio_codes
+ >>> audio_values = outputs.audio_values
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+ if padding_mask is None:
+ padding_mask = torch.ones_like(input_values).bool()
+
+ if audio_codes is None:
+ encoder_outputs = self.encode(
+ input_values, padding_mask, num_quantizers, encoder_past_key_values, return_dict=return_dict
+ )
+ audio_codes = encoder_outputs[0]
+ if return_dict:
+ encoder_past_key_values = encoder_outputs.get("past_key_values")
+ elif len(encoder_outputs) > 1:
+ encoder_past_key_values = encoder_outputs[1]
+
+ decoder_outputs = self.decode(audio_codes, padding_mask, decoder_past_key_values, return_dict=return_dict)
+ audio_values = decoder_outputs[0]
+ if return_dict:
+ decoder_past_key_values = decoder_outputs.get("past_key_values")
+ elif len(decoder_outputs) > 1:
+ decoder_past_key_values = decoder_outputs[1]
+
+ if not return_dict:
+ return (audio_codes, audio_values, encoder_past_key_values, decoder_past_key_values)
+
+ return MimiOutput(
+ audio_codes=audio_codes,
+ audio_values=audio_values,
+ encoder_past_key_values=encoder_past_key_values,
+ decoder_past_key_values=decoder_past_key_values,
+ )
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index 2db7b38b580375..5f8ae6b5fbffac 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -5840,6 +5840,20 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
+class MimiModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class MimiPreTrainedModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
class MistralForCausalLM(metaclass=DummyObject):
_backends = ["torch"]
diff --git a/tests/models/mimi/__init__.py b/tests/models/mimi/__init__.py
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/tests/models/mimi/test_modeling_mimi.py b/tests/models/mimi/test_modeling_mimi.py
new file mode 100644
index 00000000000000..dd0f77421be728
--- /dev/null
+++ b/tests/models/mimi/test_modeling_mimi.py
@@ -0,0 +1,890 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Testing suite for the PyTorch Mimi model."""
+
+import inspect
+import os
+import tempfile
+import unittest
+
+import numpy as np
+from datasets import Audio, load_dataset
+from packaging import version
+from parameterized import parameterized
+from pytest import mark
+
+from transformers import AutoFeatureExtractor, MimiConfig
+from transformers.testing_utils import (
+ is_flaky,
+ is_torch_available,
+ require_flash_attn,
+ require_torch,
+ require_torch_gpu,
+ require_torch_sdpa,
+ slow,
+ torch_device,
+)
+from transformers.utils import (
+ is_torch_bf16_available_on_device,
+ is_torch_fp16_available_on_device,
+)
+
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
+
+
+if is_torch_available():
+ import torch
+
+ from transformers import MimiModel
+
+
+# Copied from transformers.tests.encodec.test_modeling_encodec.prepare_inputs_dict
+def prepare_inputs_dict(
+ config,
+ input_ids=None,
+ input_values=None,
+ decoder_input_ids=None,
+ attention_mask=None,
+ decoder_attention_mask=None,
+ head_mask=None,
+ decoder_head_mask=None,
+ cross_attn_head_mask=None,
+):
+ if input_ids is not None:
+ encoder_dict = {"input_ids": input_ids}
+ else:
+ encoder_dict = {"input_values": input_values}
+
+ decoder_dict = {"decoder_input_ids": decoder_input_ids} if decoder_input_ids is not None else {}
+
+ return {**encoder_dict, **decoder_dict}
+
+
+@require_torch
+class MimiModelTester:
+ def __init__(
+ self,
+ parent,
+ batch_size=5,
+ num_channels=1,
+ is_training=False,
+ intermediate_size=40,
+ hidden_size=32,
+ num_filters=8,
+ num_residual_layers=1,
+ upsampling_ratios=[8, 4],
+ codebook_size=64,
+ vector_quantization_hidden_dimension=64,
+ codebook_dim=64,
+ upsample_groups=32,
+ num_hidden_layers=2,
+ num_attention_heads=2,
+ num_key_value_heads=2,
+ sliding_window=4,
+ use_cache=False,
+ ):
+ self.parent = parent
+ self.batch_size = batch_size
+ self.num_channels = num_channels
+ self.is_training = is_training
+ self.intermediate_size = intermediate_size
+ self.hidden_size = hidden_size
+ self.num_filters = num_filters
+ self.num_residual_layers = num_residual_layers
+ self.upsampling_ratios = upsampling_ratios
+ self.codebook_size = codebook_size
+ self.vector_quantization_hidden_dimension = vector_quantization_hidden_dimension
+ self.codebook_dim = codebook_dim
+ self.upsample_groups = upsample_groups
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.num_key_value_heads = num_key_value_heads
+ self.sliding_window = sliding_window
+ self.use_cache = use_cache
+
+ def prepare_config_and_inputs(self):
+ input_values = floats_tensor([self.batch_size, self.num_channels, self.intermediate_size], scale=1.0)
+ config = self.get_config()
+ inputs_dict = {"input_values": input_values}
+ return config, inputs_dict
+
+ def prepare_config_and_inputs_for_common(self):
+ config, inputs_dict = self.prepare_config_and_inputs()
+ return config, inputs_dict
+
+ def prepare_config_and_inputs_for_model_class(self, model_class):
+ config, inputs_dict = self.prepare_config_and_inputs()
+ inputs_dict["audio_codes"] = ids_tensor([self.batch_size, 1, self.num_channels], self.codebook_size).type(
+ torch.int32
+ )
+
+ return config, inputs_dict
+
+ def get_config(self):
+ return MimiConfig(
+ audio_channels=self.num_channels,
+ chunk_in_sec=None,
+ hidden_size=self.hidden_size,
+ num_filters=self.num_filters,
+ num_residual_layers=self.num_residual_layers,
+ upsampling_ratios=self.upsampling_ratios,
+ codebook_size=self.codebook_size,
+ vector_quantization_hidden_dimension=self.vector_quantization_hidden_dimension,
+ upsample_groups=self.upsample_groups,
+ num_hidden_layers=self.num_hidden_layers,
+ num_attention_heads=self.num_attention_heads,
+ num_key_value_heads=self.num_key_value_heads,
+ sliding_window=self.sliding_window,
+ codebook_dim=self.codebook_dim,
+ use_cache=self.use_cache,
+ )
+
+ def create_and_check_model_forward(self, config, inputs_dict):
+ model = MimiModel(config=config).to(torch_device).eval()
+
+ input_values = inputs_dict["input_values"]
+ result = model(input_values)
+ self.parent.assertEqual(
+ result.audio_values.shape, (self.batch_size, self.num_channels, self.intermediate_size)
+ )
+
+
+@require_torch
+class MimiModelTest(ModelTesterMixin, unittest.TestCase):
+ all_model_classes = (MimiModel,) if is_torch_available() else ()
+ is_encoder_decoder = True
+ test_pruning = False
+ test_headmasking = False
+ test_resize_embeddings = False
+ test_torchscript = False
+ input_name = "input_values"
+
+ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
+ # model does not support returning attentions or hidden states, so drop those keys from the inputs
+ inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
+ if "output_attentions" in inputs_dict:
+ inputs_dict.pop("output_attentions")
+ if "output_hidden_states" in inputs_dict:
+ inputs_dict.pop("output_hidden_states")
+ return inputs_dict
+
+ def setUp(self):
+ self.model_tester = MimiModelTester(self)
+ self.config_tester = ConfigTester(
+ self, config_class=MimiConfig, hidden_size=37, common_properties=[], has_text_modality=False
+ )
+
+ def test_config(self):
+ self.config_tester.run_common_tests()
+
+ def test_model_forward(self):
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
+ self.model_tester.create_and_check_model_forward(*config_and_inputs)
+
+ def test_forward_signature(self):
+ config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+ for model_class in self.all_model_classes:
+ model = model_class(config)
+ signature = inspect.signature(model.forward)
+ # signature.parameters is an OrderedDict => so arg_names order is deterministic
+ arg_names = [*signature.parameters.keys()]
+
+ expected_arg_names = ["input_values", "padding_mask", "num_quantizers"]
+ self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
+
+ @unittest.skip(reason="The MimiModel does not have `inputs_embeds` logics")
+ def test_inputs_embeds(self):
+ pass
+
+ @unittest.skip(reason="The MimiModel does not have `inputs_embeds` logics")
+ def test_model_get_set_embeddings(self):
+ pass
+
+ @unittest.skip(reason="The MimiModel does not have the usual `attention` logic")
+ def test_retain_grad_hidden_states_attentions(self):
+ pass
+
+ @unittest.skip(reason="The MimiModel does not have the usual `attention` logic")
+ def test_torchscript_output_attentions(self):
+ pass
+
+ @unittest.skip(reason="The MimiModel does not have the usual `hidden_states` logic")
+ def test_torchscript_output_hidden_state(self):
+ pass
+
+ # Copied from transformers.tests.encodec.test_modeling_encodec.MimiModelTest._create_and_check_torchscript
+ def _create_and_check_torchscript(self, config, inputs_dict):
+ if not self.test_torchscript:
+ self.skipTest(reason="test_torchscript is set to False")
+
+ configs_no_init = _config_zero_init(config) # To be sure we have no Nan
+ configs_no_init.torchscript = True
+ configs_no_init.return_dict = False
+ for model_class in self.all_model_classes:
+ model = model_class(config=configs_no_init)
+ model.to(torch_device)
+ model.eval()
+ inputs = self._prepare_for_class(inputs_dict, model_class)
+
+ main_input_name = model_class.main_input_name
+
+ try:
+ main_input = inputs[main_input_name]
+ model(main_input)
+ traced_model = torch.jit.trace(model, main_input)
+ except RuntimeError:
+ self.fail("Couldn't trace module.")
+
+ with tempfile.TemporaryDirectory() as tmp_dir_name:
+ pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt")
+
+ try:
+ torch.jit.save(traced_model, pt_file_name)
+ except Exception:
+ self.fail("Couldn't save module.")
+
+ try:
+ loaded_model = torch.jit.load(pt_file_name)
+ except Exception:
+ self.fail("Couldn't load module.")
+
+ model.to(torch_device)
+ model.eval()
+
+ loaded_model.to(torch_device)
+ loaded_model.eval()
+
+ model_state_dict = model.state_dict()
+ loaded_model_state_dict = loaded_model.state_dict()
+
+ non_persistent_buffers = {}
+ for key in loaded_model_state_dict.keys():
+ if key not in model_state_dict.keys():
+ non_persistent_buffers[key] = loaded_model_state_dict[key]
+
+ loaded_model_state_dict = {
+ key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers
+ }
+
+ self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys()))
+
+ model_buffers = list(model.buffers())
+ for non_persistent_buffer in non_persistent_buffers.values():
+ found_buffer = False
+ for i, model_buffer in enumerate(model_buffers):
+ if torch.equal(non_persistent_buffer, model_buffer):
+ found_buffer = True
+ break
+
+ self.assertTrue(found_buffer)
+ model_buffers.pop(i)
+
+ model_buffers = list(model.buffers())
+ for non_persistent_buffer in non_persistent_buffers.values():
+ found_buffer = False
+ for i, model_buffer in enumerate(model_buffers):
+ if torch.equal(non_persistent_buffer, model_buffer):
+ found_buffer = True
+ break
+
+ self.assertTrue(found_buffer)
+ model_buffers.pop(i)
+
+ models_equal = True
+ for layer_name, p1 in model_state_dict.items():
+ if layer_name in loaded_model_state_dict:
+ p2 = loaded_model_state_dict[layer_name]
+ if p1.data.ne(p2.data).sum() > 0:
+ models_equal = False
+
+ self.assertTrue(models_equal)
+
+ # Avoid memory leak. Without this, each call increases RAM usage by ~20MB.
+ # (Even with this call, there is still a memory leak of ~0.04MB)
+ self.clear_torch_jit_class_registry()
+
+ @unittest.skip(reason="The MimiModel does not have the usual `attention` logic")
+ def test_attention_outputs(self):
+ pass
+
+ @unittest.skip(reason="The MimiModel does not have the usual `hidden_states` logic")
+ def test_hidden_states_output(self):
+ pass
+
+ # Copied from transformers.tests.encodec.test_modeling_encodec.MimiModelTest.test_determinism
+ def test_determinism(self):
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+ def check_determinism(first, second):
+ # outputs are not tensors but lists (since each sequence doesn't have the same frame_length)
+ out_1 = first.cpu().numpy()
+ out_2 = second.cpu().numpy()
+ out_1 = out_1[~np.isnan(out_1)]
+ out_2 = out_2[~np.isnan(out_2)]
+ max_diff = np.amax(np.abs(out_1 - out_2))
+ self.assertLessEqual(max_diff, 1e-5)
+
+ for model_class in self.all_model_classes:
+ model = model_class(config)
+ model.to(torch_device)
+ model.eval()
+ with torch.no_grad():
+ first = model(**self._prepare_for_class(inputs_dict, model_class))[0]
+ second = model(**self._prepare_for_class(inputs_dict, model_class))[0]
+
+ if isinstance(first, tuple) and isinstance(second, tuple):
+ for tensor1, tensor2 in zip(first, second):
+ check_determinism(tensor1, tensor2)
+ else:
+ check_determinism(first, second)
+
+ # Copied from transformers.tests.encodec.test_modeling_encodec.MimiModelTest.test_model_outputs_equivalence
+ def test_model_outputs_equivalence(self):
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+ def set_nan_tensor_to_zero(t):
+ t[t != t] = 0
+ return t
+
+ def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}):
+ with torch.no_grad():
+ tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs)
+ dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs)
+
+ self.assertTrue(isinstance(tuple_output, tuple))
+ self.assertTrue(isinstance(dict_output, dict))
+
+ for tuple_value, dict_value in zip(tuple_output, dict_output.values()):
+ self.assertTrue(
+ torch.allclose(
+ set_nan_tensor_to_zero(tuple_value), set_nan_tensor_to_zero(dict_value), atol=1e-5
+ ),
+ msg=(
+ "Tuple and dict output are not equal. Difference:"
+ f" {torch.max(torch.abs(tuple_value - dict_value))}. Tuple has `nan`:"
+ f" {torch.isnan(tuple_value).any()} and `inf`: {torch.isinf(tuple_value)}. Dict has"
+ f" `nan`: {torch.isnan(dict_value).any()} and `inf`: {torch.isinf(dict_value)}."
+ ),
+ )
+
+ for model_class in self.all_model_classes:
+ model = model_class(config)
+ model.to(torch_device)
+ model.eval()
+
+ tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
+ dict_inputs = self._prepare_for_class(inputs_dict, model_class)
+ check_equivalence(model, tuple_inputs, dict_inputs)
+
+ def test_initialization(self):
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+ configs_no_init = _config_zero_init(config)
+ for model_class in self.all_model_classes:
+ model = model_class(config=configs_no_init)
+ for name, param in model.named_parameters():
+ uniform_init_parms = ["conv", "input_proj", "output_proj"]
+ if param.requires_grad:
+ if any(x in name for x in uniform_init_parms):
+ self.assertTrue(
+ -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
+ msg=f"Parameter {name} of model {model_class} seems not properly initialized",
+ )
+
+ # Copied from transformers.tests.encodec.test_modeling_encodec.MimiModelTest.test_identity_shortcut
+ def test_identity_shortcut(self):
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs()
+ config.use_conv_shortcut = False
+ self.model_tester.create_and_check_model_forward(config, inputs_dict)
+
+ @parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
+ @require_torch_sdpa
+ @slow
+ def test_eager_matches_sdpa_inference(self, torch_dtype: str):
+ if not self.has_attentions:
+ self.skipTest(reason="Model architecture does not support attentions")
+
+ if not self.all_model_classes[0]._supports_sdpa:
+ self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA")
+
+ if torch_dtype == "float16" and not is_torch_fp16_available_on_device(torch_device):
+ self.skipTest(f"float16 not supported on {torch_device} (on the specific device currently used)")
+
+ if torch_dtype == "bfloat16" and not is_torch_bf16_available_on_device(torch_device):
+ self.skipTest(
+ f"bfloat16 not supported on {torch_device} (on the specific device currently used, e.g. Nvidia T4 GPU)"
+ )
+
+ # Not sure whether it's fine to put torch.XXX in a decorator if torch is not available so hacking it here instead.
+ if torch_dtype == "float16":
+ torch_dtype = torch.float16
+ elif torch_dtype == "bfloat16":
+ torch_dtype = torch.bfloat16
+ elif torch_dtype == "float32":
+ torch_dtype = torch.float32
+
+ atols = {
+ ("cpu", False, torch.float32): 1e-6,
+ ("cpu", False, torch.bfloat16): 1e-2,
+ ("cpu", True, torch.float32): 1e-6,
+ ("cpu", True, torch.bfloat16): 1e-2,
+ ("cuda", False, torch.float32): 1e-6,
+ ("cuda", False, torch.bfloat16): 1e-2,
+ ("cuda", False, torch.float16): 5e-3,
+ ("cuda", True, torch.float32): 1e-6,
+ ("cuda", True, torch.bfloat16): 1e-2,
+ ("cuda", True, torch.float16): 5e-3,
+ }
+ rtols = {
+ ("cpu", False, torch.float32): 1e-4,
+ ("cpu", False, torch.bfloat16): 1e-2,
+ ("cpu", True, torch.float32): 1e-4,
+ ("cpu", True, torch.bfloat16): 1e-2,
+ ("cuda", False, torch.float32): 1e-4,
+ ("cuda", False, torch.bfloat16): 1e-2,
+ ("cuda", False, torch.float16): 5e-3,
+ ("cuda", True, torch.float32): 1e-4,
+ ("cuda", True, torch.bfloat16): 3e-2,
+ ("cuda", True, torch.float16): 5e-3,
+ }
+
+ def get_mean_reldiff(failcase, x, ref, atol, rtol):
+ return f"{failcase}: mean relative difference: {((x - ref).abs() / (ref.abs() + 1e-12)).mean():.3e}, torch atol = {atol}, torch rtol = {rtol}"
+
+ for model_class in self.all_model_classes:
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+ model = model_class(config)
+ # FIXME: we deactivate boolean mask for models using "use_mask_token" in their constructors.
+ # These models support masking only in the case `use_mask_token=True`. Otherwise they cannot consume an input mask.
+ # This means that the class needs to be instantiated much later, after `use_mask` is set, which means a significant refactor of the code.
+ # However masking there is not done at any layer that matters (i.e. self-attention), therefore we can safely deactivate it.
+ deactivate_mask = "use_mask_token" in inspect.signature(model_class).parameters
+
+ is_encoder_decoder = model.config.is_encoder_decoder
+
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ model.save_pretrained(tmpdirname)
+ model_sdpa = model_class.from_pretrained(tmpdirname, torch_dtype=torch_dtype)
+ model_sdpa = model_sdpa.eval().to(torch_device)
+
+ self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
+
+ model_eager = model_class.from_pretrained(
+ tmpdirname,
+ torch_dtype=torch_dtype,
+ attn_implementation="eager",
+ )
+ model_eager = model_eager.eval().to(torch_device)
+
+ self.assertTrue(model_eager.config._attn_implementation == "eager")
+
+ for name, submodule in model_eager.named_modules():
+ class_name = submodule.__class__.__name__
+ if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
+ raise ValueError("The eager model should not have SDPA attention layers")
+
+ has_sdpa = False
+ for name, submodule in model_sdpa.named_modules():
+ class_name = submodule.__class__.__name__
+ if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
+ has_sdpa = True
+ break
+ if not has_sdpa and model_sdpa.config.model_type != "falcon":
+ raise ValueError("The SDPA model should have SDPA attention layers")
+
+ # We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 16 times the model,
+ # but it would be nicer to have an efficient way to use parameterized.expand
+ fail_cases = []
+ for padding_side in ["left", "right"]:
+ for use_mask in [False, True]:
+ for output_attentions in [True, False]:
+ can_output_attn = "output_attentions" in inspect.signature(model_sdpa.forward).parameters
+ if not (self.has_attentions and can_output_attn) and output_attentions:
+ continue
+ for batch_size in [1, 5]:
+ dummy_input = inputs_dict[model.main_input_name]
+
+ if dummy_input.dtype in [torch.float32, torch.bfloat16, torch.float16]:
+ dummy_input = dummy_input.to(torch_dtype)
+
+ dummy_input = dummy_input[:batch_size]
+ if dummy_input.shape[0] != batch_size:
+ if dummy_input.dtype in [torch.float32, torch.bfloat16, torch.float16]:
+ extension = torch.rand(
+ batch_size - dummy_input.shape[0],
+ *dummy_input.shape[1:],
+ dtype=torch_dtype,
+ device=torch_device,
+ )
+ dummy_input = torch.cat((dummy_input, extension), dim=0).to(torch_device)
+ else:
+ extension = torch.randint(
+ high=5,
+ size=(batch_size - dummy_input.shape[0], *dummy_input.shape[1:]),
+ dtype=dummy_input.dtype,
+ device=torch_device,
+ )
+ dummy_input = torch.cat((dummy_input, extension), dim=0).to(torch_device)
+
+ if not use_mask:
+ dummy_attention_mask = None
+ else:
+ dummy_attention_mask = inputs_dict.get("attention_mask", None)
+ if dummy_attention_mask is None:
+ if is_encoder_decoder:
+ seqlen = inputs_dict.get("decoder_input_ids", dummy_input).shape[-1]
+ else:
+ seqlen = dummy_input.shape[-1]
+ dummy_attention_mask = (
+ torch.ones(batch_size, seqlen).to(torch.int64).to(torch_device)
+ )
+
+ dummy_attention_mask = dummy_attention_mask[:batch_size]
+ if dummy_attention_mask.shape[0] != batch_size:
+ extension = torch.ones(
+ batch_size - dummy_attention_mask.shape[0],
+ *dummy_attention_mask.shape[1:],
+ dtype=dummy_attention_mask.dtype,
+ device=torch_device,
+ )
+ dummy_attention_mask = torch.cat((dummy_attention_mask, extension), dim=0)
+ dummy_attention_mask = dummy_attention_mask.to(torch_device)
+
+ dummy_attention_mask[:] = 1
+ if padding_side == "left":
+ dummy_attention_mask[-1, :-1] = 1
+ dummy_attention_mask[-1, -4:] = 0
+ elif padding_side == "right":
+ dummy_attention_mask[-1, 1:] = 1
+ dummy_attention_mask[-1, :3] = 0
+
+ for enable_kernels in [False, True]:
+ failcase = f"padding_side={padding_side}, use_mask={use_mask}, batch_size={batch_size}, enable_kernels={enable_kernels}"
+ if is_encoder_decoder:
+ decoder_input_ids = inputs_dict.get("decoder_input_ids", dummy_input)[
+ :batch_size
+ ]
+ if decoder_input_ids.shape[0] != batch_size:
+ extension = torch.ones(
+ batch_size - decoder_input_ids.shape[0],
+ *decoder_input_ids.shape[1:],
+ dtype=decoder_input_ids.dtype,
+ device=torch_device,
+ )
+ decoder_input_ids = torch.cat((decoder_input_ids, extension), dim=0)
+ decoder_input_ids = decoder_input_ids.to(torch_device)
+
+ # TODO: never an `attention_mask` arg here?
+ processed_inputs = {
+ model.main_input_name: dummy_input,
+ "decoder_input_ids": decoder_input_ids,
+ "decoder_attention_mask": dummy_attention_mask,
+ "output_hidden_states": True,
+ }
+ else:
+ processed_inputs = {
+ model.main_input_name: dummy_input,
+ "output_hidden_states": True,
+ }
+
+ # Otherwise fails for e.g. WhisperEncoderModel
+ if "attention_mask" in inspect.signature(model_eager.forward).parameters:
+ processed_inputs["attention_mask"] = dummy_attention_mask
+
+ if (
+ self.has_attentions
+ and "output_attentions" in inspect.signature(model_sdpa.forward).parameters
+ ):
+ processed_inputs["output_attentions"] = output_attentions
+ if not deactivate_mask and (
+ "bool_masked_pos" in inspect.signature(model_eager.forward).parameters
+ ):
+ dummy_mask = torch.ones((self.model_tester.num_masks,))
+
+ # In case of additional token (like class) we define a custom `mask_length`
+ if hasattr(self.model_tester, "mask_length"):
+ mask_length = self.model_tester.mask_length - dummy_mask.size(0)
+ else:
+ mask_length = self.model_tester.seq_length - dummy_mask.size(0)
+ dummy_mask = torch.cat([dummy_mask, torch.zeros(mask_length)])
+ dummy_bool_masked_pos = dummy_mask.expand(batch_size, -1).bool()
+ processed_inputs["bool_masked_pos"] = dummy_bool_masked_pos.to(torch_device)
+
+ if "noise" in inspect.signature(model_eager.forward).parameters:
+ np.random.seed(2)
+ num_patches = int(
+ (self.model_tester.image_size // self.model_tester.patch_size) ** 2
+ )
+ noise = np.random.uniform(size=(batch_size, num_patches))
+ processed_inputs["noise"] = torch.from_numpy(noise)
+
+ # TODO: test gradients as well (& for FA2 as well!)
+ with torch.no_grad():
+ with torch.backends.cuda.sdp_kernel(
+ enable_flash=enable_kernels,
+ enable_math=True,
+ enable_mem_efficient=enable_kernels,
+ ):
+ prepared_inputs = self._prepare_for_class(processed_inputs, model_class)
+ outputs_eager = model_eager(**prepared_inputs)
+ outputs_sdpa = model_sdpa(**prepared_inputs)
+
+ # Ignore copy
+ logits_eager = outputs_eager.audio_values
+ # Ignore copy
+ logits_sdpa = outputs_sdpa.audio_values
+
+ if torch_device in ["cpu", "cuda"]:
+ atol = atols[torch_device, enable_kernels, torch_dtype]
+ rtol = rtols[torch_device, enable_kernels, torch_dtype]
+ else:
+ atol = 1e-7
+ rtol = 1e-4
+
+ # Masked tokens output slightly deviates - we don't mind that.
+ if use_mask:
+ if padding_side == "left":
+ sub_sdpa = logits_sdpa[:-1]
+ sub_eager = logits_eager[:-1]
+ if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
+ fail_cases.append(
+ get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol)
+ )
+
+ sub_sdpa = logits_sdpa[-1, :-4]
+ sub_eager = logits_eager[-1, :-4]
+ if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
+ fail_cases.append(
+ get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol)
+ )
+
+ # Testing the padding tokens is not really meaningful but anyway
+ # sub_sdpa = logits_sdpa[-1, -4:]
+ # sub_eager = logits_eager[-1, -4:]
+ # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
+ # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2))
+ elif padding_side == "right":
+ sub_sdpa = logits_sdpa[:-1]
+ sub_eager = logits_eager[:-1]
+ if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
+ fail_cases.append(
+ get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol)
+ )
+
+ sub_sdpa = logits_sdpa[-1, 3:]
+ sub_eager = logits_eager[-1, 3:]
+ if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
+ fail_cases.append(
+ get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol)
+ )
+
+ # Testing the padding tokens is not really meaningful but anyway
+ # sub_sdpa = logits_sdpa[-1, :3]
+ # sub_eager = logits_eager[-1, :3]
+ # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
+ # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2))
+
+ else:
+ if not torch.allclose(logits_sdpa, logits_eager, atol=atol, rtol=rtol):
+ fail_cases.append(
+ get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol)
+ )
+
+ self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases))
+
+ @require_flash_attn
+ @require_torch_gpu
+ @mark.flash_attn_test
+ @slow
+ @is_flaky()
+ def test_flash_attn_2_inference_equivalence(self):
+ for model_class in self.all_model_classes:
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+ model = model_class(config)
+
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ model.save_pretrained(tmpdirname)
+ model_fa = model_class.from_pretrained(
+ tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
+ )
+ model_fa.to(torch_device)
+
+ model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16)
+ model.to(torch_device)
+
+ dummy_input = inputs_dict[model.main_input_name][:1]
+ if dummy_input.dtype in [torch.float32, torch.float16]:
+ dummy_input = dummy_input.to(torch.bfloat16)
+
+ outputs = model(dummy_input)
+ outputs_fa = model_fa(dummy_input)
+
+ logits = outputs[1]
+ logits_fa = outputs_fa[1]
+
+ assert torch.allclose(logits_fa, logits, atol=4e-2, rtol=4e-2)
+
+ @unittest.skip(reason="The MimiModel does not support right padding")
+ def test_flash_attn_2_inference_equivalence_right_padding(self):
+ pass
+
+ @unittest.skip(reason="The MimiModel does not have support dynamic compile yet")
+ def test_sdpa_can_compile_dynamic(self):
+ pass
+
+ # For now, Let's focus only on GPU for `torch.compile`
+ @slow
+ @require_torch_gpu
+ def test_torch_compile(self):
+ if version.parse(torch.__version__) < version.parse("2.3"):
+ self.skipTest(reason="This test requires torch >= 2.3 to run.")
+
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+ n_iter = 3
+ for model_class in self.all_model_classes:
+ model = model_class(config).to(torch_device)
+ model.forward = torch.compile(model.forward)
+ for i in range(n_iter):
+ _ = model(inputs_dict["input_values"].to(torch_device))
+
+ @is_flaky()
+ def test_batching_equivalence(self):
+ super().test_batching_equivalence()
+
+
+# Copied from transformers.tests.encodec.test_modeling_encodec.normalize
+def normalize(arr):
+ norm = np.linalg.norm(arr)
+ normalized_arr = arr / norm
+ return normalized_arr
+
+
+# Copied from transformers.tests.encodec.test_modeling_encodec.compute_rmse
+def compute_rmse(arr1, arr2):
+ arr1_normalized = normalize(arr1)
+ arr2_normalized = normalize(arr2)
+ return np.sqrt(((arr1_normalized - arr2_normalized) ** 2).mean())
+
+
+@slow
+@require_torch
+class MimiIntegrationTest(unittest.TestCase):
+ def test_integration_using_cache_decode(self):
+ expected_rmse = {
+ "8": 0.0018785292,
+ "32": 0.0012330565,
+ }
+
+ librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+ model_id = "kyutai/mimi"
+
+ model = MimiModel.from_pretrained(model_id, use_cache=True).to(torch_device)
+ processor = AutoFeatureExtractor.from_pretrained(model_id)
+
+ librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=processor.sampling_rate))
+ audio_sample = librispeech_dummy[-1]["audio"]["array"]
+
+ inputs = processor(
+ raw_audio=audio_sample,
+ sampling_rate=processor.sampling_rate,
+ return_tensors="pt",
+ ).to(torch_device)
+
+ for num_codebooks, expected_rmse in expected_rmse.items():
+ with torch.no_grad():
+ # encode with the requested number of quantizers (codebooks)
+ encoder_outputs = model.encode(inputs["input_values"], num_quantizers=int(num_codebooks))
+
+ audio_codes = encoder_outputs[0]
+
+ decoder_outputs_first_part = model.decode(audio_codes[:, :, : audio_codes.shape[2] // 2])
+ decoder_outputs_second_part = model.decode(
+ audio_codes[:, :, audio_codes.shape[2] // 2 :],
+ decoder_past_key_values=decoder_outputs_first_part.decoder_past_key_values,
+ )
+
+ audio_output_entire_context = model.decode(audio_codes)[0]
+ audio_output_concat_context = torch.cat(
+ [decoder_outputs_first_part[0], decoder_outputs_second_part[0]], dim=2
+ )
+
+ # make sure audios are more or less equal
+ # the RMSE of two random gaussian noise vectors with ~N(0, 1) is around 1.0
+ rmse = compute_rmse(
+ audio_output_concat_context.squeeze().cpu().numpy(),
+ audio_output_entire_context.squeeze().cpu().numpy(),
+ )
+ self.assertTrue(rmse < 1e-3)
+
+ def test_integration(self):
+ expected_rmses = {
+ "8": 0.0018785292,
+ "32": 0.0012330565,
+ }
+ expected_codesums = {
+ "8": 430423,
+ "32": 1803071,
+ }
+ librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+ model_id = "kyutai/mimi"
+
+ processor = AutoFeatureExtractor.from_pretrained(model_id)
+
+ librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=processor.sampling_rate))
+ audio_sample = librispeech_dummy[-1]["audio"]["array"]
+
+ inputs = processor(
+ raw_audio=audio_sample,
+ sampling_rate=processor.sampling_rate,
+ return_tensors="pt",
+ ).to(torch_device)
+
+ for use_cache in [False, True]:
+ model = MimiModel.from_pretrained(model_id, use_cache=use_cache).to(torch_device)
+ for num_codebooks, expected_rmse in expected_rmses.items():
+ with torch.no_grad():
+ # encode with the requested number of quantizers (codebooks)
+ encoder_outputs = model.encode(inputs["input_values"], num_quantizers=int(num_codebooks))
+
+ audio_code_sums = encoder_outputs[0].sum().cpu().item()
+
+ # make sure audio encoded codes are correct
+ # assert relative difference less than a threshold, because `audio_code_sums` varies a bit
+ # depending on torch version
+ self.assertTrue(
+ np.abs(audio_code_sums - expected_codesums[num_codebooks]) <= (3e-3 * audio_code_sums)
+ )
+
+ input_values_dec = model.decode(encoder_outputs[0], padding_mask=inputs["padding_mask"])[0]
+ input_values_enc_dec = model(
+ inputs["input_values"], inputs["padding_mask"], num_quantizers=int(num_codebooks)
+ )[1]
+
+ # make sure forward and decode gives same result
+ self.assertTrue(torch.allclose(input_values_dec, input_values_enc_dec))
+
+ # make sure shape matches
+ self.assertTrue(inputs["input_values"].shape == input_values_enc_dec.shape)
+
+ arr = inputs["input_values"][0].cpu().numpy()
+ arr_enc_dec = input_values_enc_dec[0].cpu().numpy()
+
+ # make sure audios are more or less equal
+ # the RMSE of two random gaussian noise vectors with ~N(0, 1) is around 1.0
+ rmse = compute_rmse(arr, arr_enc_dec)
+ self.assertTrue(np.abs(rmse - expected_rmse) < 1e-5)
From e40bb4845e0eefb52ec1e9cac9c2446ab36aef81 Mon Sep 17 00:00:00 2001
From: Raushan Turganbay
Date: Thu, 19 Sep 2024 09:56:52 +0200
Subject: [PATCH 46/67] Load and save video-processor from separate folder
(#33562)
* load and save from video-processor folder
* Update src/transformers/models/llava_onevision/processing_llava_onevision.py
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
---------
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
---
.../image_processing_llava_onevision.py | 1 +
.../processing_llava_onevision.py | 53 ++++++++++++++++++-
.../test_processing_llava_onevision.py | 21 ++++----
tests/test_processing_common.py | 8 +++
4 files changed, 71 insertions(+), 12 deletions(-)
diff --git a/src/transformers/models/llava_onevision/image_processing_llava_onevision.py b/src/transformers/models/llava_onevision/image_processing_llava_onevision.py
index 3dddcdd148a416..2047557208372a 100644
--- a/src/transformers/models/llava_onevision/image_processing_llava_onevision.py
+++ b/src/transformers/models/llava_onevision/image_processing_llava_onevision.py
@@ -621,6 +621,7 @@ def preprocess(
"""
do_resize = do_resize if do_resize is not None else self.do_resize
size = size if size is not None else self.size
+ size = get_size_dict(size, default_to_square=False)
image_grid_pinpoints = image_grid_pinpoints if image_grid_pinpoints is not None else self.image_grid_pinpoints
resample = resample if resample is not None else self.resample
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
diff --git a/src/transformers/models/llava_onevision/processing_llava_onevision.py b/src/transformers/models/llava_onevision/processing_llava_onevision.py
index e050ec3f31deea..d4ae02e0bb154c 100644
--- a/src/transformers/models/llava_onevision/processing_llava_onevision.py
+++ b/src/transformers/models/llava_onevision/processing_llava_onevision.py
@@ -17,6 +17,7 @@
"""
import math
+import os
import sys
from typing import Iterable, List, Union
@@ -34,6 +35,11 @@
ProcessorMixin,
)
from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import logging
+from ..auto import AutoImageProcessor
+
+
+logger = logging.get_logger(__name__)
class LlavaOnevisionProcessorKwargs(ProcessingKwargs, total=False):
@@ -96,7 +102,7 @@ def __init__(
chat_template=None,
image_token="",
video_token="