From 4b0418df11886547e2c701cc4504627881397a0b Mon Sep 17 00:00:00 2001
From: Raushan Turganbay <raushan@huggingface.co>
Date: Fri, 13 Sep 2024 12:58:38 +0200
Subject: [PATCH 01/67] Enable `padding_side` as call time kwargs (#33385)

* fix

* add padding-side kwarg

* add padding side in all models & fix tests

* fix copies

* fix tests
---
 .../layoutlmv2/tokenization_layoutlmv2.py     | 29 +++++++++--
 .../tokenization_layoutlmv2_fast.py           | 22 +++++++--
 .../layoutlmv3/tokenization_layoutlmv3.py     | 29 +++++++++--
 .../tokenization_layoutlmv3_fast.py           | 22 +++++++--
 .../layoutxlm/tokenization_layoutxlm.py       | 23 +++++++--
 .../layoutxlm/tokenization_layoutxlm_fast.py  | 18 +++++--
 .../models/led/tokenization_led.py            |  2 +
 .../models/led/tokenization_led_fast.py       |  2 +
 .../models/luke/tokenization_luke.py          | 29 +++++++++--
 .../models/markuplm/tokenization_markuplm.py  | 29 +++++++++--
 .../markuplm/tokenization_markuplm_fast.py    | 22 +++++++--
 .../models/mluke/tokenization_mluke.py        | 29 +++++++++--
 .../models/roc_bert/tokenization_roc_bert.py  | 17 +++++--
 .../models/tapas/tokenization_tapas.py        | 27 ++++++++--
 .../models/udop/tokenization_udop.py          | 27 ++++++++--
 .../models/udop/tokenization_udop_fast.py     | 22 +++++++--
 .../models/wav2vec2/tokenization_wav2vec2.py  |  6 +++
 src/transformers/tokenization_utils.py        |  7 +++
 src/transformers/tokenization_utils_base.py   | 37 ++++++++++++--
 src/transformers/tokenization_utils_fast.py   | 10 +++-
 .../test_tokenization_layoutlmv2.py           | 44 ++++++++++-------
 .../test_tokenization_layoutlmv3.py           | 44 ++++++++++-------
 .../layoutxlm/test_tokenization_layoutxlm.py  | 44 ++++++++++-------
 .../markuplm/test_tokenization_markuplm.py    | 44 ++++++++++-------
 tests/models/tapas/test_tokenization_tapas.py | 43 +++++++++-------
 tests/test_tokenization_common.py             | 49 ++++++++++++-------
 26 files changed, 528 insertions(+), 149 deletions(-)

diff --git a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py
index fe0305562374..c5ec79666dee 100644
--- a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py
@@ -414,6 +414,7 @@ def __call__(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -517,6 +518,7 @@ def _is_valid_text_input(t):
                 max_length=max_length,
                 stride=stride,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_tensors=return_tensors,
                 return_token_type_ids=return_token_type_ids,
                 return_attention_mask=return_attention_mask,
@@ -539,6 +541,7 @@ def _is_valid_text_input(t):
                 max_length=max_length,
                 stride=stride,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_tensors=return_tensors,
                 return_token_type_ids=return_token_type_ids,
                 return_attention_mask=return_attention_mask,
@@ -567,6 +570,7 @@ def batch_encode_plus(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -598,6 +602,7 @@ def batch_encode_plus(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
@@ -625,6 +630,7 @@ def _batch_encode_plus(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -653,6 +659,7 @@ def _batch_encode_plus(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_attention_mask=return_attention_mask,
             return_token_type_ids=return_token_type_ids,
             return_overflowing_tokens=return_overflowing_tokens,
@@ -677,6 +684,7 @@ def _batch_prepare_for_model(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[str] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -708,6 +716,7 @@ def _batch_prepare_for_model(
                 max_length=max_length,
                 stride=stride,
                 pad_to_multiple_of=None,  # we pad in batch afterward
+                padding_side=None,  # we pad in batch afterward
                 return_attention_mask=False,  # we pad in batch afterward
                 return_token_type_ids=return_token_type_ids,
                 return_overflowing_tokens=return_overflowing_tokens,
@@ -728,6 +737,7 @@ def _batch_prepare_for_model(
             padding=padding_strategy.value,
             max_length=max_length,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_attention_mask=return_attention_mask,
         )
 
@@ -748,6 +758,7 @@ def encode(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -769,6 +780,7 @@ def encode(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
@@ -795,6 +807,7 @@ def encode_plus(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -838,6 +851,7 @@ def encode_plus(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
@@ -861,6 +875,7 @@ def _encode_plus(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -891,6 +906,7 @@ def _encode_plus(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             prepend_batch_axis=True,
             return_attention_mask=return_attention_mask,
@@ -914,6 +930,7 @@ def prepare_for_model(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -1100,6 +1117,7 @@ def prepare_for_model(
                 max_length=max_length,
                 padding=padding_strategy.value,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_attention_mask=return_attention_mask,
             )
 
@@ -1243,6 +1261,7 @@ def _pad(
         max_length: Optional[int] = None,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
     ) -> dict:
         """
@@ -1265,6 +1284,9 @@ def _pad(
             pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                 This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                 `>= 7.5` (Volta).
+            padding_side:
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_attention_mask:
                 (optional) Set to False to avoid returning attention mask (default: set to model specifics)
         """
@@ -1288,7 +1310,8 @@ def _pad(
 
         if needs_to_be_padded:
             difference = max_length - len(required_input)
-            if self.padding_side == "right":
+            padding_side = padding_side if padding_side is not None else self.padding_side
+            if padding_side == "right":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
                 if "token_type_ids" in encoded_inputs:
@@ -1302,7 +1325,7 @@ def _pad(
                 if "special_tokens_mask" in encoded_inputs:
                     encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
                 encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
-            elif self.padding_side == "left":
+            elif padding_side == "left":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
                 if "token_type_ids" in encoded_inputs:
@@ -1317,7 +1340,7 @@ def _pad(
                     encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
                 encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
             else:
-                raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+                raise ValueError("Invalid padding strategy:" + str(padding_side))
 
         return encoded_inputs
 
diff --git a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py
index aa2bf6b3226b..a666e3d4ea1a 100644
--- a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py
+++ b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py
@@ -165,6 +165,7 @@ def __call__(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -268,6 +269,7 @@ def _is_valid_text_input(t):
                 max_length=max_length,
                 stride=stride,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_tensors=return_tensors,
                 return_token_type_ids=return_token_type_ids,
                 return_attention_mask=return_attention_mask,
@@ -290,6 +292,7 @@ def _is_valid_text_input(t):
                 max_length=max_length,
                 stride=stride,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_tensors=return_tensors,
                 return_token_type_ids=return_token_type_ids,
                 return_attention_mask=return_attention_mask,
@@ -318,6 +321,7 @@ def batch_encode_plus(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -349,6 +353,7 @@ def batch_encode_plus(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
@@ -381,6 +386,7 @@ def encode_plus(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -424,6 +430,7 @@ def encode_plus(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
@@ -451,6 +458,7 @@ def _batch_encode_plus(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[str] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -470,6 +478,7 @@ def _batch_encode_plus(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
         )
 
         if is_pair:
@@ -603,6 +612,7 @@ def _encode_plus(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[bool] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -631,6 +641,7 @@ def _encode_plus(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
@@ -663,6 +674,7 @@ def _pad(
         max_length: Optional[int] = None,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
     ) -> dict:
         """
@@ -685,6 +697,9 @@ def _pad(
             pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                 This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                 `>= 7.5` (Volta).
+            padding_side:
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_attention_mask:
                 (optional) Set to False to avoid returning attention mask (default: set to model specifics)
         """
@@ -708,7 +723,8 @@ def _pad(
 
         if needs_to_be_padded:
             difference = max_length - len(required_input)
-            if self.padding_side == "right":
+            padding_side = padding_side if padding_side is not None else self.padding_side
+            if padding_side == "right":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
                 if "token_type_ids" in encoded_inputs:
@@ -722,7 +738,7 @@ def _pad(
                 if "special_tokens_mask" in encoded_inputs:
                     encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
                 encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
-            elif self.padding_side == "left":
+            elif padding_side == "left":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
                 if "token_type_ids" in encoded_inputs:
@@ -737,7 +753,7 @@ def _pad(
                     encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
                 encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
             else:
-                raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+                raise ValueError("Invalid padding strategy:" + str(padding_side))
 
         return encoded_inputs
 
diff --git a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py
index 89f899f22f4e..248a299c141f 100644
--- a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py
+++ b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py
@@ -543,6 +543,7 @@ def __call__(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -646,6 +647,7 @@ def _is_valid_text_input(t):
                 max_length=max_length,
                 stride=stride,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_tensors=return_tensors,
                 return_token_type_ids=return_token_type_ids,
                 return_attention_mask=return_attention_mask,
@@ -668,6 +670,7 @@ def _is_valid_text_input(t):
                 max_length=max_length,
                 stride=stride,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_tensors=return_tensors,
                 return_token_type_ids=return_token_type_ids,
                 return_attention_mask=return_attention_mask,
@@ -697,6 +700,7 @@ def batch_encode_plus(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -728,6 +732,7 @@ def batch_encode_plus(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
@@ -756,6 +761,7 @@ def _batch_encode_plus(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -784,6 +790,7 @@ def _batch_encode_plus(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_attention_mask=return_attention_mask,
             return_token_type_ids=return_token_type_ids,
             return_overflowing_tokens=return_overflowing_tokens,
@@ -809,6 +816,7 @@ def _batch_prepare_for_model(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[str] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -840,6 +848,7 @@ def _batch_prepare_for_model(
                 max_length=max_length,
                 stride=stride,
                 pad_to_multiple_of=None,  # we pad in batch afterward
+                padding_side=None,  # we pad in batch afterward
                 return_attention_mask=False,  # we pad in batch afterward
                 return_token_type_ids=return_token_type_ids,
                 return_overflowing_tokens=return_overflowing_tokens,
@@ -860,6 +869,7 @@ def _batch_prepare_for_model(
             padding=padding_strategy.value,
             max_length=max_length,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_attention_mask=return_attention_mask,
         )
 
@@ -881,6 +891,7 @@ def encode(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -902,6 +913,7 @@ def encode(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
@@ -929,6 +941,7 @@ def encode_plus(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -972,6 +985,7 @@ def encode_plus(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
@@ -996,6 +1010,7 @@ def _encode_plus(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -1026,6 +1041,7 @@ def _encode_plus(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             prepend_batch_axis=True,
             return_attention_mask=return_attention_mask,
@@ -1049,6 +1065,7 @@ def prepare_for_model(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -1237,6 +1254,7 @@ def prepare_for_model(
                 max_length=max_length,
                 padding=padding_strategy.value,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_attention_mask=return_attention_mask,
             )
 
@@ -1382,6 +1400,7 @@ def _pad(
         max_length: Optional[int] = None,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
     ) -> dict:
         """
@@ -1404,6 +1423,9 @@ def _pad(
             pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                 This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                 `>= 7.5` (Volta).
+            padding_side:
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_attention_mask:
                 (optional) Set to False to avoid returning attention mask (default: set to model specifics)
         """
@@ -1427,7 +1449,8 @@ def _pad(
 
         if needs_to_be_padded:
             difference = max_length - len(required_input)
-            if self.padding_side == "right":
+            padding_side = padding_side if padding_side is not None else self.padding_side
+            if padding_side == "right":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
                 if "token_type_ids" in encoded_inputs:
@@ -1441,7 +1464,7 @@ def _pad(
                 if "special_tokens_mask" in encoded_inputs:
                     encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
                 encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
-            elif self.padding_side == "left":
+            elif padding_side == "left":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
                 if "token_type_ids" in encoded_inputs:
@@ -1456,6 +1479,6 @@ def _pad(
                     encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
                 encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
             else:
-                raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+                raise ValueError("Invalid padding strategy:" + str(padding_side))
 
         return encoded_inputs
diff --git a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py
index 07bedf36133a..63cd1022e521 100644
--- a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py
+++ b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py
@@ -217,6 +217,7 @@ def __call__(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -320,6 +321,7 @@ def _is_valid_text_input(t):
                 max_length=max_length,
                 stride=stride,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_tensors=return_tensors,
                 return_token_type_ids=return_token_type_ids,
                 return_attention_mask=return_attention_mask,
@@ -342,6 +344,7 @@ def _is_valid_text_input(t):
                 max_length=max_length,
                 stride=stride,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_tensors=return_tensors,
                 return_token_type_ids=return_token_type_ids,
                 return_attention_mask=return_attention_mask,
@@ -371,6 +374,7 @@ def batch_encode_plus(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -402,6 +406,7 @@ def batch_encode_plus(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
@@ -436,6 +441,7 @@ def encode_plus(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -479,6 +485,7 @@ def encode_plus(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
@@ -506,6 +513,7 @@ def _batch_encode_plus(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[str] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -525,6 +533,7 @@ def _batch_encode_plus(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
         )
 
         if is_pair:
@@ -664,6 +673,7 @@ def _encode_plus(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[bool] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -692,6 +702,7 @@ def _encode_plus(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
@@ -725,6 +736,7 @@ def _pad(
         max_length: Optional[int] = None,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
     ) -> dict:
         """
@@ -747,6 +759,9 @@ def _pad(
             pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                 This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                 `>= 7.5` (Volta).
+            padding_side:
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_attention_mask:
                 (optional) Set to False to avoid returning attention mask (default: set to model specifics)
         """
@@ -770,7 +785,8 @@ def _pad(
 
         if needs_to_be_padded:
             difference = max_length - len(required_input)
-            if self.padding_side == "right":
+            padding_side = padding_side if padding_side is not None else self.padding_side
+            if padding_side == "right":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
                 if "token_type_ids" in encoded_inputs:
@@ -784,7 +800,7 @@ def _pad(
                 if "special_tokens_mask" in encoded_inputs:
                     encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
                 encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
-            elif self.padding_side == "left":
+            elif padding_side == "left":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
                 if "token_type_ids" in encoded_inputs:
@@ -799,7 +815,7 @@ def _pad(
                     encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
                 encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
             else:
-                raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+                raise ValueError("Invalid padding strategy:" + str(padding_side))
 
         return encoded_inputs
 
diff --git a/src/transformers/models/layoutxlm/tokenization_layoutxlm.py b/src/transformers/models/layoutxlm/tokenization_layoutxlm.py
index 3ab57ac892aa..248f16af8441 100644
--- a/src/transformers/models/layoutxlm/tokenization_layoutxlm.py
+++ b/src/transformers/models/layoutxlm/tokenization_layoutxlm.py
@@ -447,6 +447,7 @@ def __call__(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -550,6 +551,7 @@ def _is_valid_text_input(t):
                 max_length=max_length,
                 stride=stride,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_tensors=return_tensors,
                 return_token_type_ids=return_token_type_ids,
                 return_attention_mask=return_attention_mask,
@@ -572,6 +574,7 @@ def _is_valid_text_input(t):
                 max_length=max_length,
                 stride=stride,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_tensors=return_tensors,
                 return_token_type_ids=return_token_type_ids,
                 return_attention_mask=return_attention_mask,
@@ -599,6 +602,7 @@ def _batch_encode_plus(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -627,6 +631,7 @@ def _batch_encode_plus(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_attention_mask=return_attention_mask,
             return_token_type_ids=return_token_type_ids,
             return_overflowing_tokens=return_overflowing_tokens,
@@ -651,6 +656,7 @@ def _batch_prepare_for_model(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[str] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -682,6 +688,7 @@ def _batch_prepare_for_model(
                 max_length=max_length,
                 stride=stride,
                 pad_to_multiple_of=None,  # we pad in batch afterward
+                padding_side=None,  # we pad in batch afterward
                 return_attention_mask=False,  # we pad in batch afterward
                 return_token_type_ids=return_token_type_ids,
                 return_overflowing_tokens=return_overflowing_tokens,
@@ -702,6 +709,7 @@ def _batch_prepare_for_model(
             padding=padding_strategy.value,
             max_length=max_length,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_attention_mask=return_attention_mask,
         )
 
@@ -721,6 +729,7 @@ def _encode_plus(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -751,6 +760,7 @@ def _encode_plus(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             prepend_batch_axis=True,
             return_attention_mask=return_attention_mask,
@@ -774,6 +784,7 @@ def prepare_for_model(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -947,6 +958,7 @@ def prepare_for_model(
                 max_length=max_length,
                 padding=padding_strategy.value,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_attention_mask=return_attention_mask,
             )
 
@@ -1090,6 +1102,7 @@ def _pad(
         max_length: Optional[int] = None,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
     ) -> dict:
         """
@@ -1112,6 +1125,9 @@ def _pad(
             pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                 This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                 `>= 7.5` (Volta).
+            padding_side (`str`, *optional*):
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_attention_mask:
                 (optional) Set to False to avoid returning attention mask (default: set to model specifics)
         """
@@ -1135,7 +1151,8 @@ def _pad(
 
         if needs_to_be_padded:
             difference = max_length - len(required_input)
-            if self.padding_side == "right":
+            padding_side = padding_side if padding_side is not None else self.padding_side
+            if padding_side == "right":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
                 if "token_type_ids" in encoded_inputs:
@@ -1149,7 +1166,7 @@ def _pad(
                 if "special_tokens_mask" in encoded_inputs:
                     encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
                 encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
-            elif self.padding_side == "left":
+            elif padding_side == "left":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
                 if "token_type_ids" in encoded_inputs:
@@ -1164,6 +1181,6 @@ def _pad(
                     encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
                 encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
             else:
-                raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+                raise ValueError("Invalid padding strategy:" + str(padding_side))
 
         return encoded_inputs
diff --git a/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py b/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py
index 6d68cb9f18e7..7d12cec496ea 100644
--- a/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py
+++ b/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py
@@ -277,6 +277,7 @@ def __call__(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -380,6 +381,7 @@ def _is_valid_text_input(t):
                 max_length=max_length,
                 stride=stride,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_tensors=return_tensors,
                 return_token_type_ids=return_token_type_ids,
                 return_attention_mask=return_attention_mask,
@@ -402,6 +404,7 @@ def _is_valid_text_input(t):
                 max_length=max_length,
                 stride=stride,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_tensors=return_tensors,
                 return_token_type_ids=return_token_type_ids,
                 return_attention_mask=return_attention_mask,
@@ -442,6 +445,7 @@ def _batch_encode_plus(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[str] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -462,6 +466,7 @@ def _batch_encode_plus(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
         )
 
         if is_pair:
@@ -595,6 +600,7 @@ def _encode_plus(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[bool] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -623,6 +629,7 @@ def _encode_plus(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
@@ -655,6 +662,7 @@ def _pad(
         max_length: Optional[int] = None,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
     ) -> dict:
         """
@@ -677,6 +685,9 @@ def _pad(
             pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                 This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                 `>= 7.5` (Volta).
+            padding_side (`str`, *optional*):
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_attention_mask:
                 (optional) Set to False to avoid returning attention mask (default: set to model specifics)
         """
@@ -700,7 +711,8 @@ def _pad(
 
         if needs_to_be_padded:
             difference = max_length - len(required_input)
-            if self.padding_side == "right":
+            padding_side = padding_side if padding_side is not None else self.padding_side
+            if padding_side == "right":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
                 if "token_type_ids" in encoded_inputs:
@@ -714,7 +726,7 @@ def _pad(
                 if "special_tokens_mask" in encoded_inputs:
                     encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
                 encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
-            elif self.padding_side == "left":
+            elif padding_side == "left":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
                 if "token_type_ids" in encoded_inputs:
@@ -729,7 +741,7 @@ def _pad(
                     encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
                 encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
             else:
-                raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+                raise ValueError("Invalid padding strategy:" + str(padding_side))
 
         return encoded_inputs
 
diff --git a/src/transformers/models/led/tokenization_led.py b/src/transformers/models/led/tokenization_led.py
index aaf09e6d149e..6c1ec9526aef 100644
--- a/src/transformers/models/led/tokenization_led.py
+++ b/src/transformers/models/led/tokenization_led.py
@@ -412,6 +412,7 @@ def _pad(
         max_length: Optional[int] = None,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
     ) -> dict:
         encoded_inputs = super()._pad(
@@ -419,6 +420,7 @@ def _pad(
             max_length=max_length,
             padding_strategy=padding_strategy,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_attention_mask=return_attention_mask,
         )
 
diff --git a/src/transformers/models/led/tokenization_led_fast.py b/src/transformers/models/led/tokenization_led_fast.py
index ca15eb997bed..6ee69fbe7927 100644
--- a/src/transformers/models/led/tokenization_led_fast.py
+++ b/src/transformers/models/led/tokenization_led_fast.py
@@ -288,6 +288,7 @@ def _pad(
         max_length: Optional[int] = None,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
     ) -> dict:
         encoded_inputs = super()._pad(
@@ -295,6 +296,7 @@ def _pad(
             max_length=max_length,
             padding_strategy=padding_strategy,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_attention_mask=return_attention_mask,
         )
 
diff --git a/src/transformers/models/luke/tokenization_luke.py b/src/transformers/models/luke/tokenization_luke.py
index 1a570992ffb4..e06b9c753fe5 100644
--- a/src/transformers/models/luke/tokenization_luke.py
+++ b/src/transformers/models/luke/tokenization_luke.py
@@ -570,6 +570,7 @@ def __call__(
         stride: int = 0,
         is_split_into_words: Optional[bool] = False,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -662,6 +663,7 @@ def __call__(
                 stride=stride,
                 is_split_into_words=is_split_into_words,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_tensors=return_tensors,
                 return_token_type_ids=return_token_type_ids,
                 return_attention_mask=return_attention_mask,
@@ -688,6 +690,7 @@ def __call__(
                 stride=stride,
                 is_split_into_words=is_split_into_words,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_tensors=return_tensors,
                 return_token_type_ids=return_token_type_ids,
                 return_attention_mask=return_attention_mask,
@@ -715,6 +718,7 @@ def _encode_plus(
         stride: int = 0,
         is_split_into_words: Optional[bool] = False,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -769,6 +773,7 @@ def _encode_plus(
             max_entity_length=max_entity_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             prepend_batch_axis=True,
             return_attention_mask=return_attention_mask,
@@ -796,6 +801,7 @@ def _batch_encode_plus(
         stride: int = 0,
         is_split_into_words: Optional[bool] = False,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -876,6 +882,7 @@ def _batch_encode_plus(
             max_entity_length=max_entity_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_attention_mask=return_attention_mask,
             return_token_type_ids=return_token_type_ids,
             return_overflowing_tokens=return_overflowing_tokens,
@@ -1070,6 +1077,7 @@ def _batch_prepare_for_model(
         max_entity_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[str] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -1112,6 +1120,7 @@ def _batch_prepare_for_model(
                 max_entity_length=max_entity_length,
                 stride=stride,
                 pad_to_multiple_of=None,  # we pad in batch afterward
+                padding_side=None,  # we pad in batch afterward
                 return_attention_mask=False,  # we pad in batch afterward
                 return_token_type_ids=return_token_type_ids,
                 return_overflowing_tokens=return_overflowing_tokens,
@@ -1132,6 +1141,7 @@ def _batch_prepare_for_model(
             padding=padding_strategy.value,
             max_length=max_length,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_attention_mask=return_attention_mask,
         )
 
@@ -1155,6 +1165,7 @@ def prepare_for_model(
         max_entity_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -1357,6 +1368,7 @@ def prepare_for_model(
                 max_entity_length=max_entity_length,
                 padding=padding_strategy.value,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_attention_mask=return_attention_mask,
             )
 
@@ -1382,6 +1394,7 @@ def pad(
         max_length: Optional[int] = None,
         max_entity_length: Optional[int] = None,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         verbose: bool = True,
@@ -1418,6 +1431,9 @@ def pad(
             pad_to_multiple_of (`int`, *optional*):
                 If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                 the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
+            padding_side:
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_attention_mask (`bool`, *optional*):
                 Whether to return the attention mask. If left to the default, will return the attention mask according
                 to the specific tokenizer's default, defined by the `return_outputs` attribute. [What are attention
@@ -1495,6 +1511,7 @@ def pad(
                 max_entity_length=max_entity_length,
                 padding_strategy=padding_strategy,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_attention_mask=return_attention_mask,
             )
             return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
@@ -1519,6 +1536,7 @@ def pad(
                 max_entity_length=max_entity_length,
                 padding_strategy=padding_strategy,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_attention_mask=return_attention_mask,
             )
 
@@ -1536,6 +1554,7 @@ def _pad(
         max_entity_length: Optional[int] = None,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
     ) -> dict:
         """
@@ -1562,6 +1581,9 @@ def _pad(
             pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                 This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                 `>= 7.5` (Volta).
+            padding_side:
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_attention_mask:
                 (optional) Set to False to avoid returning attention mask (default: set to model specifics)
         """
@@ -1600,9 +1622,10 @@ def _pad(
 
         if needs_to_be_padded:
             difference = max_length - len(encoded_inputs["input_ids"])
+            padding_side = padding_side if padding_side is not None else self.padding_side
             if entities_provided:
                 entity_difference = max_entity_length - len(encoded_inputs["entity_ids"])
-            if self.padding_side == "right":
+            if padding_side == "right":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
                     if entities_provided:
@@ -1633,7 +1656,7 @@ def _pad(
                             encoded_inputs["entity_end_positions"] + [0] * entity_difference
                         )
 
-            elif self.padding_side == "left":
+            elif padding_side == "left":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
                     if entities_provided:
@@ -1664,7 +1687,7 @@ def _pad(
                             "entity_end_positions"
                         ]
             else:
-                raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+                raise ValueError("Invalid padding strategy:" + str(padding_side))
 
         return encoded_inputs
 
diff --git a/src/transformers/models/markuplm/tokenization_markuplm.py b/src/transformers/models/markuplm/tokenization_markuplm.py
index c77865abc934..e5de1e4e765c 100644
--- a/src/transformers/models/markuplm/tokenization_markuplm.py
+++ b/src/transformers/models/markuplm/tokenization_markuplm.py
@@ -503,6 +503,7 @@ def __call__(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -602,6 +603,7 @@ def _is_valid_text_input(t):
                 max_length=max_length,
                 stride=stride,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_tensors=return_tensors,
                 return_token_type_ids=return_token_type_ids,
                 return_attention_mask=return_attention_mask,
@@ -624,6 +626,7 @@ def _is_valid_text_input(t):
                 max_length=max_length,
                 stride=stride,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_tensors=return_tensors,
                 return_token_type_ids=return_token_type_ids,
                 return_attention_mask=return_attention_mask,
@@ -652,6 +655,7 @@ def batch_encode_plus(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -683,6 +687,7 @@ def batch_encode_plus(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
@@ -710,6 +715,7 @@ def _batch_encode_plus(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -738,6 +744,7 @@ def _batch_encode_plus(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_attention_mask=return_attention_mask,
             return_token_type_ids=return_token_type_ids,
             return_overflowing_tokens=return_overflowing_tokens,
@@ -762,6 +769,7 @@ def _batch_prepare_for_model(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[str] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -793,6 +801,7 @@ def _batch_prepare_for_model(
                 max_length=max_length,
                 stride=stride,
                 pad_to_multiple_of=None,  # we pad in batch afterward
+                padding_side=None,  # we pad in batch afterward
                 return_attention_mask=False,  # we pad in batch afterward
                 return_token_type_ids=return_token_type_ids,
                 return_overflowing_tokens=return_overflowing_tokens,
@@ -813,6 +822,7 @@ def _batch_prepare_for_model(
             padding=padding_strategy.value,
             max_length=max_length,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_attention_mask=return_attention_mask,
         )
 
@@ -833,6 +843,7 @@ def encode(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -854,6 +865,7 @@ def encode(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
@@ -880,6 +892,7 @@ def encode_plus(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -923,6 +936,7 @@ def encode_plus(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
@@ -946,6 +960,7 @@ def _encode_plus(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -976,6 +991,7 @@ def _encode_plus(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             prepend_batch_axis=True,
             return_attention_mask=return_attention_mask,
@@ -999,6 +1015,7 @@ def prepare_for_model(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -1203,6 +1220,7 @@ def prepare_for_model(
                 max_length=max_length,
                 padding=padding_strategy.value,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_attention_mask=return_attention_mask,
             )
 
@@ -1357,6 +1375,7 @@ def _pad(
         max_length: Optional[int] = None,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
     ) -> dict:
         """
@@ -1376,6 +1395,9 @@ def _pad(
             pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                 This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                 `>= 7.5` (Volta).
+            padding_side:
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_attention_mask:
                 (optional) Set to False to avoid returning attention mask (default: set to model specifics)
         """
@@ -1399,7 +1421,8 @@ def _pad(
 
         if needs_to_be_padded:
             difference = max_length - len(required_input)
-            if self.padding_side == "right":
+            padding_side = padding_side if padding_side is not None else self.padding_side
+            if padding_side == "right":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
                 if "token_type_ids" in encoded_inputs:
@@ -1419,7 +1442,7 @@ def _pad(
                 if "special_tokens_mask" in encoded_inputs:
                     encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
                 encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
-            elif self.padding_side == "left":
+            elif padding_side == "left":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
                 if "token_type_ids" in encoded_inputs:
@@ -1440,6 +1463,6 @@ def _pad(
                     encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
                 encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
             else:
-                raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+                raise ValueError("Invalid padding strategy:" + str(padding_side))
 
         return encoded_inputs
diff --git a/src/transformers/models/markuplm/tokenization_markuplm_fast.py b/src/transformers/models/markuplm/tokenization_markuplm_fast.py
index ff0e4ffeb56e..796459876425 100644
--- a/src/transformers/models/markuplm/tokenization_markuplm_fast.py
+++ b/src/transformers/models/markuplm/tokenization_markuplm_fast.py
@@ -286,6 +286,7 @@ def __call__(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -385,6 +386,7 @@ def _is_valid_text_input(t):
                 max_length=max_length,
                 stride=stride,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_tensors=return_tensors,
                 return_token_type_ids=return_token_type_ids,
                 return_attention_mask=return_attention_mask,
@@ -407,6 +409,7 @@ def _is_valid_text_input(t):
                 max_length=max_length,
                 stride=stride,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_tensors=return_tensors,
                 return_token_type_ids=return_token_type_ids,
                 return_attention_mask=return_attention_mask,
@@ -435,6 +438,7 @@ def batch_encode_plus(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -466,6 +470,7 @@ def batch_encode_plus(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
@@ -498,6 +503,7 @@ def encode_plus(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -541,6 +547,7 @@ def encode_plus(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
@@ -568,6 +575,7 @@ def _batch_encode_plus(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[str] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -587,6 +595,7 @@ def _batch_encode_plus(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
         )
 
         if is_pair:
@@ -721,6 +730,7 @@ def _encode_plus(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[bool] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -749,6 +759,7 @@ def _encode_plus(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
@@ -781,6 +792,7 @@ def _pad(
         max_length: Optional[int] = None,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
     ) -> dict:
         """
@@ -800,6 +812,9 @@ def _pad(
             pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                 This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                 `>= 7.5` (Volta).
+            padding_side:
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_attention_mask:
                 (optional) Set to False to avoid returning attention mask (default: set to model specifics)
         """
@@ -823,7 +838,8 @@ def _pad(
 
         if needs_to_be_padded:
             difference = max_length - len(required_input)
-            if self.padding_side == "right":
+            padding_side = padding_side if padding_side is not None else self.padding_side
+            if padding_side == "right":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
                 if "token_type_ids" in encoded_inputs:
@@ -843,7 +859,7 @@ def _pad(
                 if "special_tokens_mask" in encoded_inputs:
                     encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
                 encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
-            elif self.padding_side == "left":
+            elif padding_side == "left":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
                 if "token_type_ids" in encoded_inputs:
@@ -864,7 +880,7 @@ def _pad(
                     encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
                 encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
             else:
-                raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+                raise ValueError("Invalid padding strategy:" + str(padding_side))
 
         return encoded_inputs
 
diff --git a/src/transformers/models/mluke/tokenization_mluke.py b/src/transformers/models/mluke/tokenization_mluke.py
index 3ac8191402af..f087c0d92fc6 100644
--- a/src/transformers/models/mluke/tokenization_mluke.py
+++ b/src/transformers/models/mluke/tokenization_mluke.py
@@ -399,6 +399,7 @@ def __call__(
         stride: int = 0,
         is_split_into_words: Optional[bool] = False,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -491,6 +492,7 @@ def __call__(
                 stride=stride,
                 is_split_into_words=is_split_into_words,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_tensors=return_tensors,
                 return_token_type_ids=return_token_type_ids,
                 return_attention_mask=return_attention_mask,
@@ -517,6 +519,7 @@ def __call__(
                 stride=stride,
                 is_split_into_words=is_split_into_words,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_tensors=return_tensors,
                 return_token_type_ids=return_token_type_ids,
                 return_attention_mask=return_attention_mask,
@@ -545,6 +548,7 @@ def _encode_plus(
         stride: int = 0,
         is_split_into_words: Optional[bool] = False,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -599,6 +603,7 @@ def _encode_plus(
             max_entity_length=max_entity_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             prepend_batch_axis=True,
             return_attention_mask=return_attention_mask,
@@ -627,6 +632,7 @@ def _batch_encode_plus(
         stride: int = 0,
         is_split_into_words: Optional[bool] = False,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -707,6 +713,7 @@ def _batch_encode_plus(
             max_entity_length=max_entity_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_attention_mask=return_attention_mask,
             return_token_type_ids=return_token_type_ids,
             return_overflowing_tokens=return_overflowing_tokens,
@@ -904,6 +911,7 @@ def _batch_prepare_for_model(
         max_entity_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[str] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -946,6 +954,7 @@ def _batch_prepare_for_model(
                 max_entity_length=max_entity_length,
                 stride=stride,
                 pad_to_multiple_of=None,  # we pad in batch afterward
+                padding_side=None,  # we pad in batch afterward
                 return_attention_mask=False,  # we pad in batch afterward
                 return_token_type_ids=return_token_type_ids,
                 return_overflowing_tokens=return_overflowing_tokens,
@@ -966,6 +975,7 @@ def _batch_prepare_for_model(
             padding=padding_strategy.value,
             max_length=max_length,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_attention_mask=return_attention_mask,
         )
 
@@ -990,6 +1000,7 @@ def prepare_for_model(
         max_entity_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -1192,6 +1203,7 @@ def prepare_for_model(
                 max_entity_length=max_entity_length,
                 padding=padding_strategy.value,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_attention_mask=return_attention_mask,
             )
 
@@ -1218,6 +1230,7 @@ def pad(
         max_length: Optional[int] = None,
         max_entity_length: Optional[int] = None,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         verbose: bool = True,
@@ -1254,6 +1267,9 @@ def pad(
             pad_to_multiple_of (`int`, *optional*):
                 If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                 the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
+            padding_side:
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_attention_mask (`bool`, *optional*):
                 Whether to return the attention mask. If left to the default, will return the attention mask according
                 to the specific tokenizer's default, defined by the `return_outputs` attribute. [What are attention
@@ -1331,6 +1347,7 @@ def pad(
                 max_entity_length=max_entity_length,
                 padding_strategy=padding_strategy,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_attention_mask=return_attention_mask,
             )
             return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
@@ -1355,6 +1372,7 @@ def pad(
                 max_entity_length=max_entity_length,
                 padding_strategy=padding_strategy,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_attention_mask=return_attention_mask,
             )
 
@@ -1373,6 +1391,7 @@ def _pad(
         max_entity_length: Optional[int] = None,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
     ) -> dict:
         """
@@ -1399,6 +1418,9 @@ def _pad(
             pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                 This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                 `>= 7.5` (Volta).
+            padding_side:
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_attention_mask:
                 (optional) Set to False to avoid returning attention mask (default: set to model specifics)
         """
@@ -1437,9 +1459,10 @@ def _pad(
 
         if needs_to_be_padded:
             difference = max_length - len(encoded_inputs["input_ids"])
+            padding_side = padding_side if padding_side is not None else self.padding_side
             if entities_provided:
                 entity_difference = max_entity_length - len(encoded_inputs["entity_ids"])
-            if self.padding_side == "right":
+            if padding_side == "right":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
                     if entities_provided:
@@ -1470,7 +1493,7 @@ def _pad(
                             encoded_inputs["entity_end_positions"] + [0] * entity_difference
                         )
 
-            elif self.padding_side == "left":
+            elif padding_side == "left":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
                     if entities_provided:
@@ -1501,7 +1524,7 @@ def _pad(
                             "entity_end_positions"
                         ]
             else:
-                raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+                raise ValueError("Invalid padding strategy:" + str(padding_side))
 
         return encoded_inputs
 
diff --git a/src/transformers/models/roc_bert/tokenization_roc_bert.py b/src/transformers/models/roc_bert/tokenization_roc_bert.py
index eaf2a1a49133..3a980c0ae66f 100644
--- a/src/transformers/models/roc_bert/tokenization_roc_bert.py
+++ b/src/transformers/models/roc_bert/tokenization_roc_bert.py
@@ -210,6 +210,7 @@ def _encode_plus(
         stride: int = 0,
         is_split_into_words: bool = False,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -283,6 +284,7 @@ def get_input_ids(text):
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             prepend_batch_axis=True,
             return_attention_mask=return_attention_mask,
@@ -308,6 +310,7 @@ def prepare_for_model(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -462,6 +465,7 @@ def prepare_for_model(
                 max_length=max_length,
                 padding=padding_strategy.value,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_attention_mask=return_attention_mask,
             )
 
@@ -480,6 +484,7 @@ def _pad(
         max_length: Optional[int] = None,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
     ) -> dict:
         # Load from model defaults
@@ -502,8 +507,9 @@ def _pad(
 
         if needs_to_be_padded:
             difference = max_length - len(required_input)
+            padding_side = padding_side if padding_side is not None else self.padding_side
 
-            if self.padding_side == "right":
+            if padding_side == "right":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
                 if "token_type_ids" in encoded_inputs:
@@ -516,7 +522,7 @@ def _pad(
                     if key in encoded_inputs:
                         encoded_inputs[key] = encoded_inputs[key] + [self.pad_token_id] * difference
                 encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
-            elif self.padding_side == "left":
+            elif padding_side == "left":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
                 if "token_type_ids" in encoded_inputs:
@@ -530,7 +536,7 @@ def _pad(
                         encoded_inputs[key] = [self.pad_token_id] * difference + encoded_inputs[key]
                 encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
             else:
-                raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+                raise ValueError("Invalid padding strategy:" + str(padding_side))
 
         return encoded_inputs
 
@@ -551,6 +557,7 @@ def _batch_encode_plus(
         stride: int = 0,
         is_split_into_words: bool = False,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -627,6 +634,7 @@ def get_input_ids(text):
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_attention_mask=return_attention_mask,
             return_token_type_ids=return_token_type_ids,
             return_overflowing_tokens=return_overflowing_tokens,
@@ -650,6 +658,7 @@ def _batch_prepare_for_model(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[str] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -686,6 +695,7 @@ def _batch_prepare_for_model(
                 max_length=max_length,
                 stride=stride,
                 pad_to_multiple_of=None,  # we pad in batch afterward
+                padding_side=None,  # we pad in batch afterward
                 return_attention_mask=False,  # we pad in batch afterward
                 return_token_type_ids=return_token_type_ids,
                 return_overflowing_tokens=return_overflowing_tokens,
@@ -706,6 +716,7 @@ def _batch_prepare_for_model(
             padding=padding_strategy.value,
             max_length=max_length,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_attention_mask=return_attention_mask,
         )
 
diff --git a/src/transformers/models/tapas/tokenization_tapas.py b/src/transformers/models/tapas/tokenization_tapas.py
index 2da9fe40c1ce..867e53ff8907 100644
--- a/src/transformers/models/tapas/tokenization_tapas.py
+++ b/src/transformers/models/tapas/tokenization_tapas.py
@@ -517,6 +517,7 @@ def __call__(
         truncation: Union[bool, str, TapasTruncationStrategy] = False,
         max_length: Optional[int] = None,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -581,6 +582,7 @@ def __call__(
                 truncation=truncation,
                 max_length=max_length,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_tensors=return_tensors,
                 return_token_type_ids=return_token_type_ids,
                 return_attention_mask=return_attention_mask,
@@ -602,6 +604,7 @@ def __call__(
                 truncation=truncation,
                 max_length=max_length,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_tensors=return_tensors,
                 return_token_type_ids=return_token_type_ids,
                 return_attention_mask=return_attention_mask,
@@ -631,6 +634,7 @@ def batch_encode_plus(
         truncation: Union[bool, str, TapasTruncationStrategy] = False,
         max_length: Optional[int] = None,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -699,6 +703,7 @@ def batch_encode_plus(
             truncation=truncation,
             max_length=max_length,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
@@ -738,6 +743,7 @@ def _batch_encode_plus(
         truncation: Union[bool, str, TapasTruncationStrategy] = False,
         max_length: Optional[int] = None,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = True,
         return_attention_mask: Optional[bool] = None,
@@ -768,6 +774,7 @@ def _batch_encode_plus(
             add_special_tokens=add_special_tokens,
             max_length=max_length,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             prepend_batch_axis=True,
             return_attention_mask=return_attention_mask,
@@ -797,6 +804,7 @@ def _batch_prepare_for_model(
         truncation: Union[bool, str, TapasTruncationStrategy] = False,
         max_length: Optional[int] = None,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = True,
         return_attention_mask: Optional[bool] = True,
@@ -823,6 +831,7 @@ def _batch_prepare_for_model(
                 truncation=truncation,
                 max_length=max_length,
                 pad_to_multiple_of=None,  # we pad in batch afterwards
+                padding_side=None,  # we pad in batch afterward
                 return_attention_mask=False,  # we pad in batch afterwards
                 return_token_type_ids=return_token_type_ids,
                 return_special_tokens_mask=return_special_tokens_mask,
@@ -844,6 +853,7 @@ def _batch_prepare_for_model(
             padding=padding,
             max_length=max_length,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_attention_mask=return_attention_mask,
         )
 
@@ -912,6 +922,7 @@ def encode_plus(
         truncation: Union[bool, str, TapasTruncationStrategy] = False,
         max_length: Optional[int] = None,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -968,6 +979,7 @@ def encode_plus(
             padding=padding,
             max_length=max_length,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
@@ -993,6 +1005,7 @@ def _encode_plus(
         truncation: Union[bool, str, TapasTruncationStrategy] = False,
         max_length: Optional[int] = None,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = True,
         return_attention_mask: Optional[bool] = True,
@@ -1024,6 +1037,7 @@ def _encode_plus(
             padding=padding,
             max_length=max_length,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             prepend_batch_axis=True,
             return_attention_mask=return_attention_mask,
@@ -1051,6 +1065,7 @@ def prepare_for_model(
         truncation: Union[bool, str, TapasTruncationStrategy] = False,
         max_length: Optional[int] = None,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = True,
         return_attention_mask: Optional[bool] = True,
@@ -1214,6 +1229,7 @@ def prepare_for_model(
                 max_length=max_length,
                 padding=padding.value,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_attention_mask=return_attention_mask,
             )
 
@@ -1754,6 +1770,7 @@ def _pad(
         max_length: Optional[int] = None,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
     ) -> dict:
         """
@@ -1776,6 +1793,9 @@ def _pad(
             pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                 This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                 `>= 7.5` (Volta).
+            padding_side:
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_attention_mask:
                 (optional) Set to False to avoid returning attention mask (default: set to model specifics)
         """
@@ -1799,7 +1819,8 @@ def _pad(
 
         if needs_to_be_padded:
             difference = max_length - len(encoded_inputs["input_ids"])
-            if self.padding_side == "right":
+            padding_side = padding_side if padding_side is not None else self.padding_side
+            if padding_side == "right":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
                 if "token_type_ids" in encoded_inputs:
@@ -1817,7 +1838,7 @@ def _pad(
                 if "special_tokens_mask" in encoded_inputs:
                     encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
                 encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference
-            elif self.padding_side == "left":
+            elif padding_side == "left":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
                 if "token_type_ids" in encoded_inputs:
@@ -1836,7 +1857,7 @@ def _pad(
                     encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
                 encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs["input_ids"]
             else:
-                raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+                raise ValueError("Invalid padding strategy:" + str(padding_side))
 
         return encoded_inputs
 
diff --git a/src/transformers/models/udop/tokenization_udop.py b/src/transformers/models/udop/tokenization_udop.py
index 4be979981916..e40c07a58ace 100644
--- a/src/transformers/models/udop/tokenization_udop.py
+++ b/src/transformers/models/udop/tokenization_udop.py
@@ -551,6 +551,7 @@ def call_boxes(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -654,6 +655,7 @@ def _is_valid_text_input(t):
                 max_length=max_length,
                 stride=stride,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_tensors=return_tensors,
                 return_token_type_ids=return_token_type_ids,
                 return_attention_mask=return_attention_mask,
@@ -676,6 +678,7 @@ def _is_valid_text_input(t):
                 max_length=max_length,
                 stride=stride,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_tensors=return_tensors,
                 return_token_type_ids=return_token_type_ids,
                 return_attention_mask=return_attention_mask,
@@ -704,6 +707,7 @@ def batch_encode_plus_boxes(
         stride: int = 0,
         is_split_into_words: bool = False,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -746,6 +750,7 @@ def batch_encode_plus_boxes(
             stride=stride,
             is_split_into_words=is_split_into_words,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
@@ -813,6 +818,7 @@ def encode_plus_boxes(
         stride: int = 0,
         is_split_into_words: bool = False,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -865,6 +871,7 @@ def encode_plus_boxes(
             stride=stride,
             is_split_into_words=is_split_into_words,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
@@ -892,6 +899,7 @@ def _batch_encode_plus_boxes(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -920,6 +928,7 @@ def _batch_encode_plus_boxes(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_attention_mask=return_attention_mask,
             return_token_type_ids=return_token_type_ids,
             return_overflowing_tokens=return_overflowing_tokens,
@@ -944,6 +953,7 @@ def _batch_prepare_for_model_boxes(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[str] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -975,6 +985,7 @@ def _batch_prepare_for_model_boxes(
                 max_length=max_length,
                 stride=stride,
                 pad_to_multiple_of=None,  # we pad in batch afterward
+                padding_side=None,  # we pad in batch afterward
                 return_attention_mask=False,  # we pad in batch afterward
                 return_token_type_ids=return_token_type_ids,
                 return_overflowing_tokens=return_overflowing_tokens,
@@ -995,6 +1006,7 @@ def _batch_prepare_for_model_boxes(
             padding=padding_strategy.value,
             max_length=max_length,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_attention_mask=return_attention_mask,
         )
 
@@ -1014,6 +1026,7 @@ def _encode_plus_boxes(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -1044,6 +1057,7 @@ def _encode_plus_boxes(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             prepend_batch_axis=True,
             return_attention_mask=return_attention_mask,
@@ -1067,6 +1081,7 @@ def prepare_for_model_boxes(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -1240,6 +1255,7 @@ def prepare_for_model_boxes(
                 max_length=max_length,
                 padding=padding_strategy.value,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_attention_mask=return_attention_mask,
             )
 
@@ -1385,6 +1401,7 @@ def _pad(
         max_length: Optional[int] = None,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
     ) -> dict:
         """
@@ -1407,6 +1424,9 @@ def _pad(
             pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                 This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                 `>= 7.5` (Volta).
+            padding_side (`str`, *optional*):
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_attention_mask:
                 (optional) Set to False to avoid returning attention mask (default: set to model specifics)
         """
@@ -1430,7 +1450,8 @@ def _pad(
 
         if needs_to_be_padded:
             difference = max_length - len(required_input)
-            if self.padding_side == "right":
+            padding_side = padding_side if padding_side is not None else self.padding_side
+            if padding_side == "right":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
                 if "token_type_ids" in encoded_inputs:
@@ -1444,7 +1465,7 @@ def _pad(
                 if "special_tokens_mask" in encoded_inputs:
                     encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
                 encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
-            elif self.padding_side == "left":
+            elif padding_side == "left":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
                 if "token_type_ids" in encoded_inputs:
@@ -1459,6 +1480,6 @@ def _pad(
                     encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
                 encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
             else:
-                raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+                raise ValueError("Invalid padding strategy:" + str(padding_side))
 
         return encoded_inputs
diff --git a/src/transformers/models/udop/tokenization_udop_fast.py b/src/transformers/models/udop/tokenization_udop_fast.py
index 8340c4af4e2b..8ee0577fa10e 100644
--- a/src/transformers/models/udop/tokenization_udop_fast.py
+++ b/src/transformers/models/udop/tokenization_udop_fast.py
@@ -286,6 +286,7 @@ def call_boxes(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -389,6 +390,7 @@ def _is_valid_text_input(t):
                 max_length=max_length,
                 stride=stride,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_tensors=return_tensors,
                 return_token_type_ids=return_token_type_ids,
                 return_attention_mask=return_attention_mask,
@@ -411,6 +413,7 @@ def _is_valid_text_input(t):
                 max_length=max_length,
                 stride=stride,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_tensors=return_tensors,
                 return_token_type_ids=return_token_type_ids,
                 return_attention_mask=return_attention_mask,
@@ -453,6 +456,7 @@ def batch_encode_plus_boxes(
         stride: int = 0,
         is_split_into_words: bool = False,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -501,6 +505,7 @@ def batch_encode_plus_boxes(
             stride=stride,
             is_split_into_words=is_split_into_words,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
@@ -528,6 +533,7 @@ def _batch_encode_plus_boxes(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[str] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -548,6 +554,7 @@ def _batch_encode_plus_boxes(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
         )
 
         if is_pair:
@@ -684,6 +691,7 @@ def _encode_plus_boxes(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[bool] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -712,6 +720,7 @@ def _encode_plus_boxes(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
@@ -794,6 +803,7 @@ def encode_plus_boxes(
         stride: int = 0,
         is_split_into_words: bool = False,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -846,6 +856,7 @@ def encode_plus_boxes(
             stride=stride,
             is_split_into_words=is_split_into_words,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
@@ -864,6 +875,7 @@ def _pad(
         max_length: Optional[int] = None,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
     ) -> dict:
         """
@@ -886,6 +898,9 @@ def _pad(
             pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                 This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                 `>= 7.5` (Volta).
+            padding_side (`str`, *optional*):
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_attention_mask:
                 (optional) Set to False to avoid returning attention mask (default: set to model specifics)
         """
@@ -909,7 +924,8 @@ def _pad(
 
         if needs_to_be_padded:
             difference = max_length - len(required_input)
-            if self.padding_side == "right":
+            padding_side = padding_side if padding_side is not None else self.padding_side
+            if padding_side == "right":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
                 if "token_type_ids" in encoded_inputs:
@@ -923,7 +939,7 @@ def _pad(
                 if "special_tokens_mask" in encoded_inputs:
                     encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
                 encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
-            elif self.padding_side == "left":
+            elif padding_side == "left":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
                 if "token_type_ids" in encoded_inputs:
@@ -938,7 +954,7 @@ def _pad(
                     encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
                 encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
             else:
-                raise ValueError("Invalid padding strategy:" + str(self.padding_side))
+                raise ValueError("Invalid padding strategy:" + str(padding_side))
 
         return encoded_inputs
 
diff --git a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
index 647b18521d05..c1a333fe48c6 100644
--- a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
@@ -781,6 +781,7 @@ def __call__(
         padding: Union[bool, str, PaddingStrategy] = False,
         max_length: Optional[int] = None,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         verbose: bool = True,
         **kwargs,
@@ -794,6 +795,10 @@ def __call__(
                 The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                 values, a list of numpy array or a list of list of float values. Must be mono channel audio, not
                 stereo, i.e. single float per timestep.
+
+            padding_side (`str`, *optional*):
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
         """
 
         is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
@@ -825,6 +830,7 @@ def __call__(
             padding=padding,
             max_length=max_length,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_attention_mask=self.return_attention_mask,
             return_tensors=return_tensors,
             verbose=verbose,
diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index f04eaae4525d..6a5bff3679f8 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -749,6 +749,7 @@ def _encode_plus(
         stride: int = 0,
         is_split_into_words: bool = False,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -806,6 +807,7 @@ def get_input_ids(text):
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             prepend_batch_axis=True,
             return_attention_mask=return_attention_mask,
@@ -833,6 +835,7 @@ def _batch_encode_plus(
         stride: int = 0,
         is_split_into_words: bool = False,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -891,6 +894,7 @@ def get_input_ids(text):
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_attention_mask=return_attention_mask,
             return_token_type_ids=return_token_type_ids,
             return_overflowing_tokens=return_overflowing_tokens,
@@ -913,6 +917,7 @@ def _batch_prepare_for_model(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[str] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -942,6 +947,7 @@ def _batch_prepare_for_model(
                 max_length=max_length,
                 stride=stride,
                 pad_to_multiple_of=None,  # we pad in batch afterward
+                padding_side=None,  # we pad in batch afterward
                 return_attention_mask=False,  # we pad in batch afterward
                 return_token_type_ids=return_token_type_ids,
                 return_overflowing_tokens=return_overflowing_tokens,
@@ -963,6 +969,7 @@ def _batch_prepare_for_model(
             padding=padding_strategy.value,
             max_length=max_length,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_attention_mask=return_attention_mask,
         )
 
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 5e9170456a07..93dea5ba09de 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1427,6 +1427,9 @@ def all_special_ids(self) -> List[int]:
                 If set will pad the sequence to a multiple of the provided value. Requires `padding` to be activated.
                 This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                 `>= 7.5` (Volta).
+            padding_side (`str`, *optional*):
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_tensors (`str` or [`~utils.TensorType`], *optional*):
                 If set, will return tensors instead of list of python integers. Acceptable values are:
 
@@ -2767,6 +2770,7 @@ def encode(
         truncation: Union[bool, str, TruncationStrategy] = None,
         max_length: Optional[int] = None,
         stride: int = 0,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         **kwargs,
     ) -> List[int]:
@@ -2793,6 +2797,7 @@ def encode(
             truncation=truncation,
             max_length=max_length,
             stride=stride,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             **kwargs,
         )
@@ -2956,6 +2961,7 @@ def __call__(
         stride: int = 0,
         is_split_into_words: bool = False,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -2997,6 +3003,7 @@ def __call__(
             "stride": stride,
             "is_split_into_words": is_split_into_words,
             "pad_to_multiple_of": pad_to_multiple_of,
+            "padding_side": padding_side,
             "return_tensors": return_tensors,
             "return_token_type_ids": return_token_type_ids,
             "return_attention_mask": return_attention_mask,
@@ -3041,6 +3048,7 @@ def _call_one(
         stride: int = 0,
         is_split_into_words: bool = False,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -3111,6 +3119,7 @@ def _is_valid_text_input(t):
                 stride=stride,
                 is_split_into_words=is_split_into_words,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_tensors=return_tensors,
                 return_token_type_ids=return_token_type_ids,
                 return_attention_mask=return_attention_mask,
@@ -3133,6 +3142,7 @@ def _is_valid_text_input(t):
                 stride=stride,
                 is_split_into_words=is_split_into_words,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_tensors=return_tensors,
                 return_token_type_ids=return_token_type_ids,
                 return_attention_mask=return_attention_mask,
@@ -3157,6 +3167,7 @@ def encode_plus(
         stride: int = 0,
         is_split_into_words: bool = False,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -3207,6 +3218,7 @@ def encode_plus(
             stride=stride,
             is_split_into_words=is_split_into_words,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
@@ -3230,6 +3242,7 @@ def _encode_plus(
         stride: int = 0,
         is_split_into_words: bool = False,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -3261,6 +3274,7 @@ def batch_encode_plus(
         stride: int = 0,
         is_split_into_words: bool = False,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -3307,6 +3321,7 @@ def batch_encode_plus(
             stride=stride,
             is_split_into_words=is_split_into_words,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
@@ -3336,6 +3351,7 @@ def _batch_encode_plus(
         stride: int = 0,
         is_split_into_words: bool = False,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -3361,6 +3377,7 @@ def pad(
         padding: Union[bool, str, PaddingStrategy] = True,
         max_length: Optional[int] = None,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         verbose: bool = True,
@@ -3409,6 +3426,9 @@ def pad(
 
                 This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                 `>= 7.5` (Volta).
+            padding_side (`str`, *optional*):
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_attention_mask (`bool`, *optional*):
                 Whether to return the attention mask. If left to the default, will return the attention mask according
                 to the specific tokenizer's default, defined by the `return_outputs` attribute.
@@ -3491,6 +3511,7 @@ def pad(
                 max_length=max_length,
                 padding_strategy=padding_strategy,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_attention_mask=return_attention_mask,
             )
             return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
@@ -3512,6 +3533,7 @@ def pad(
                 max_length=max_length,
                 padding_strategy=padding_strategy,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_attention_mask=return_attention_mask,
             )
 
@@ -3573,6 +3595,7 @@ def prepare_for_model(
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -3686,6 +3709,7 @@ def prepare_for_model(
                 max_length=max_length,
                 padding=padding_strategy.value,
                 pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
                 return_attention_mask=return_attention_mask,
             )
 
@@ -3828,6 +3852,7 @@ def _pad(
         max_length: Optional[int] = None,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
     ) -> dict:
         """
@@ -3843,13 +3868,16 @@ def _pad(
                 - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                 - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                 - PaddingStrategy.DO_NOT_PAD: Do not pad
-                The tokenizer padding sides are defined in self.padding_side:
+                The tokenizer padding sides are defined in `padding_side` argument:
 
                     - 'left': pads on the left of the sequences
                     - 'right': pads on the right of the sequences
             pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                 This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                 `>= 7.5` (Volta).
+            padding_side:
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
             return_attention_mask:
                 (optional) Set to False to avoid returning attention mask (default: set to model specifics)
         """
@@ -3873,8 +3901,9 @@ def _pad(
 
         if needs_to_be_padded:
             difference = max_length - len(required_input)
+            padding_side = padding_side if padding_side is not None else self.padding_side
 
-            if self.padding_side == "right":
+            if padding_side == "right":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
                 if "token_type_ids" in encoded_inputs:
@@ -3884,7 +3913,7 @@ def _pad(
                 if "special_tokens_mask" in encoded_inputs:
                     encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
                 encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
-            elif self.padding_side == "left":
+            elif padding_side == "left":
                 if return_attention_mask:
                     encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
                 if "token_type_ids" in encoded_inputs:
@@ -3895,7 +3924,7 @@ def _pad(
                     encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
                 encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
             else:
-                raise ValueError(f"Invalid padding strategy:{self.padding_side}")
+                raise ValueError(f"Invalid padding strategy:{padding_side}")
 
         return encoded_inputs
 
diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py
index 7d5446d7cbf2..724484b3b30b 100644
--- a/src/transformers/tokenization_utils_fast.py
+++ b/src/transformers/tokenization_utils_fast.py
@@ -429,6 +429,7 @@ def set_truncation_and_padding(
         max_length: int,
         stride: int,
         pad_to_multiple_of: Optional[int],
+        padding_side: Optional[bool],
     ):
         """
         Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers
@@ -450,6 +451,9 @@ def set_truncation_and_padding(
             pad_to_multiple_of (`int`, *optional*):
                 If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                 the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
+            padding_side (`str`, *optional*):
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
         """
         _truncation = self._tokenizer.truncation
         _padding = self._tokenizer.padding
@@ -484,7 +488,7 @@ def set_truncation_and_padding(
             length = max_length if padding_strategy == PaddingStrategy.MAX_LENGTH else None
             target = {
                 "length": length,
-                "direction": self.padding_side,
+                "direction": padding_side if padding_side is not None else self.padding_side,
                 "pad_id": self.pad_token_id,
                 "pad_token": self.pad_token,
                 "pad_type_id": self.pad_token_type_id,
@@ -505,6 +509,7 @@ def _batch_encode_plus(
         stride: int = 0,
         is_split_into_words: bool = False,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[str] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -527,6 +532,7 @@ def _batch_encode_plus(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
         )
 
         if self._tokenizer.encode_special_tokens != split_special_tokens:
@@ -593,6 +599,7 @@ def _encode_plus(
         stride: int = 0,
         is_split_into_words: bool = False,
         pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[bool] = None,
         return_tensors: Optional[bool] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
@@ -614,6 +621,7 @@ def _encode_plus(
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
             return_tensors=return_tensors,
             return_token_type_ids=return_token_type_ids,
             return_attention_mask=return_attention_mask,
diff --git a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py
index bb526e140e57..19a6aeec46f9 100644
--- a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py
+++ b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py
@@ -21,6 +21,8 @@
 import unittest
 from typing import List
 
+from parameterized import parameterized
+
 from transformers import (
     AddedToken,
     LayoutLMv2TokenizerFast,
@@ -393,7 +395,8 @@ def test_right_and_left_truncation(self):
     def test_split_special_tokens(self):
         pass
 
-    def test_encode_plus_with_padding(self):
+    @parameterized.expand([(True,), (False,)])
+    def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool):
         tokenizers = self.get_tokenizers(do_lower_case=False)
         for tokenizer in tokenizers:
             with self.subTest(f"{tokenizer.__class__.__name__}"):
@@ -444,15 +447,18 @@ def test_encode_plus_with_padding(self):
                 self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask)
 
                 # Test right padding
-                tokenizer.padding_side = "right"
+                tokenizer_kwargs_right = {
+                    "max_length": sequence_length + padding_size,
+                    "padding": "max_length",
+                    "return_special_tokens_mask": True,
+                }
+
+                if not use_padding_as_call_kwarg:
+                    tokenizer.padding_side = "right"
+                else:
+                    tokenizer_kwargs_right["padding_side"] = "right"
 
-                right_padded_sequence = tokenizer.encode_plus(
-                    words,
-                    boxes=boxes,
-                    max_length=sequence_length + padding_size,
-                    padding="max_length",
-                    return_special_tokens_mask=True,
-                )
+                right_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_right)
                 right_padded_input_ids = right_padded_sequence["input_ids"]
 
                 right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
@@ -463,14 +469,18 @@ def test_encode_plus_with_padding(self):
                 self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask)
 
                 # Test left padding
-                tokenizer.padding_side = "left"
-                left_padded_sequence = tokenizer.encode_plus(
-                    words,
-                    boxes=boxes,
-                    max_length=sequence_length + padding_size,
-                    padding="max_length",
-                    return_special_tokens_mask=True,
-                )
+                tokenizer_kwargs_left = {
+                    "max_length": sequence_length + padding_size,
+                    "padding": "max_length",
+                    "return_special_tokens_mask": True,
+                }
+
+                if not use_padding_as_call_kwarg:
+                    tokenizer.padding_side = "left"
+                else:
+                    tokenizer_kwargs_left["padding_side"] = "left"
+
+                left_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_left)
                 left_padded_input_ids = left_padded_sequence["input_ids"]
                 left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
                 left_padded_sequence_length = len(left_padded_input_ids)
diff --git a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py
index 5ea384f0b264..007e23430b3a 100644
--- a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py
+++ b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py
@@ -22,6 +22,8 @@
 import unittest
 from typing import List
 
+from parameterized import parameterized
+
 from transformers import (
     AddedToken,
     LayoutLMv3TokenizerFast,
@@ -273,7 +275,8 @@ def test_right_and_left_truncation(self):
     def test_split_special_tokens(self):
         pass
 
-    def test_encode_plus_with_padding(self):
+    @parameterized.expand([(True,), (False,)])
+    def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool):
         tokenizers = self.get_tokenizers(do_lower_case=False)
         for tokenizer in tokenizers:
             with self.subTest(f"{tokenizer.__class__.__name__}"):
@@ -324,15 +327,18 @@ def test_encode_plus_with_padding(self):
                 self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask)
 
                 # Test right padding
-                tokenizer.padding_side = "right"
+                tokenizer_kwargs_right = {
+                    "max_length": sequence_length + padding_size,
+                    "padding": "max_length",
+                    "return_special_tokens_mask": True,
+                }
+
+                if not use_padding_as_call_kwarg:
+                    tokenizer.padding_side = "right"
+                else:
+                    tokenizer_kwargs_right["padding_side"] = "right"
 
-                right_padded_sequence = tokenizer.encode_plus(
-                    words,
-                    boxes=boxes,
-                    max_length=sequence_length + padding_size,
-                    padding="max_length",
-                    return_special_tokens_mask=True,
-                )
+                right_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_right)
                 right_padded_input_ids = right_padded_sequence["input_ids"]
 
                 right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
@@ -343,14 +349,18 @@ def test_encode_plus_with_padding(self):
                 self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask)
 
                 # Test left padding
-                tokenizer.padding_side = "left"
-                left_padded_sequence = tokenizer.encode_plus(
-                    words,
-                    boxes=boxes,
-                    max_length=sequence_length + padding_size,
-                    padding="max_length",
-                    return_special_tokens_mask=True,
-                )
+                tokenizer_kwargs_left = {
+                    "max_length": sequence_length + padding_size,
+                    "padding": "max_length",
+                    "return_special_tokens_mask": True,
+                }
+
+                if not use_padding_as_call_kwarg:
+                    tokenizer.padding_side = "left"
+                else:
+                    tokenizer_kwargs_left["padding_side"] = "left"
+
+                left_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_left)
                 left_padded_input_ids = left_padded_sequence["input_ids"]
                 left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
                 left_padded_sequence_length = len(left_padded_input_ids)
diff --git a/tests/models/layoutxlm/test_tokenization_layoutxlm.py b/tests/models/layoutxlm/test_tokenization_layoutxlm.py
index c0e44fcb3049..8acd3716cf57 100644
--- a/tests/models/layoutxlm/test_tokenization_layoutxlm.py
+++ b/tests/models/layoutxlm/test_tokenization_layoutxlm.py
@@ -19,6 +19,8 @@
 import unittest
 from typing import List
 
+from parameterized import parameterized
+
 from transformers import (
     AddedToken,
     LayoutXLMTokenizerFast,
@@ -324,7 +326,8 @@ def test_encode_decode_with_spaces(self):
                 decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens)
                 self.assertIn(decoded, [output, output.lower()])
 
-    def test_encode_plus_with_padding(self):
+    @parameterized.expand([(True,), (False,)])
+    def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool):
         tokenizers = self.get_tokenizers(do_lower_case=False)
         for tokenizer in tokenizers:
             with self.subTest(f"{tokenizer.__class__.__name__}"):
@@ -375,15 +378,18 @@ def test_encode_plus_with_padding(self):
                 self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask)
 
                 # Test right padding
-                tokenizer.padding_side = "right"
+                tokenizer_kwargs_right = {
+                    "max_length": sequence_length + padding_size,
+                    "padding": "max_length",
+                    "return_special_tokens_mask": True,
+                }
+
+                if not use_padding_as_call_kwarg:
+                    tokenizer.padding_side = "right"
+                else:
+                    tokenizer_kwargs_right["padding_side"] = "right"
 
-                right_padded_sequence = tokenizer.encode_plus(
-                    words,
-                    boxes=boxes,
-                    max_length=sequence_length + padding_size,
-                    padding="max_length",
-                    return_special_tokens_mask=True,
-                )
+                right_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_right)
                 right_padded_input_ids = right_padded_sequence["input_ids"]
 
                 right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
@@ -394,14 +400,18 @@ def test_encode_plus_with_padding(self):
                 self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask)
 
                 # Test left padding
-                tokenizer.padding_side = "left"
-                left_padded_sequence = tokenizer.encode_plus(
-                    words,
-                    boxes=boxes,
-                    max_length=sequence_length + padding_size,
-                    padding="max_length",
-                    return_special_tokens_mask=True,
-                )
+                tokenizer_kwargs_left = {
+                    "max_length": sequence_length + padding_size,
+                    "padding": "max_length",
+                    "return_special_tokens_mask": True,
+                }
+
+                if not use_padding_as_call_kwarg:
+                    tokenizer.padding_side = "left"
+                else:
+                    tokenizer_kwargs_left["padding_side"] = "left"
+
+                left_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_left)
                 left_padded_input_ids = left_padded_sequence["input_ids"]
                 left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
                 left_padded_sequence_length = len(left_padded_input_ids)
diff --git a/tests/models/markuplm/test_tokenization_markuplm.py b/tests/models/markuplm/test_tokenization_markuplm.py
index 458df94ec2fb..fcdde2eb8a87 100644
--- a/tests/models/markuplm/test_tokenization_markuplm.py
+++ b/tests/models/markuplm/test_tokenization_markuplm.py
@@ -22,6 +22,8 @@
 import unittest
 from typing import List
 
+from parameterized import parameterized
+
 from transformers import (
     AddedToken,
     MarkupLMTokenizerFast,
@@ -211,7 +213,8 @@ def test_encode_decode_with_spaces(self):
     def test_right_and_left_truncation(self):
         pass
 
-    def test_encode_plus_with_padding(self):
+    @parameterized.expand([(True,), (False,)])
+    def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool):
         tokenizers = self.get_tokenizers(do_lower_case=False)
         for tokenizer in tokenizers:
             with self.subTest(f"{tokenizer.__class__.__name__}"):
@@ -262,15 +265,18 @@ def test_encode_plus_with_padding(self):
                 self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask)
 
                 # Test right padding
-                tokenizer.padding_side = "right"
+                tokenizer_kwargs_right = {
+                    "max_length": sequence_length + padding_size,
+                    "padding": "max_length",
+                    "return_special_tokens_mask": True,
+                }
+
+                if not use_padding_as_call_kwarg:
+                    tokenizer.padding_side = "right"
+                else:
+                    tokenizer_kwargs_right["padding_side"] = "right"
 
-                right_padded_sequence = tokenizer.encode_plus(
-                    nodes,
-                    xpaths=xpaths,
-                    max_length=sequence_length + padding_size,
-                    padding="max_length",
-                    return_special_tokens_mask=True,
-                )
+                right_padded_sequence = tokenizer.encode_plus(nodes, xpaths=xpaths, **tokenizer_kwargs_right)
                 right_padded_input_ids = right_padded_sequence["input_ids"]
 
                 right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
@@ -281,14 +287,18 @@ def test_encode_plus_with_padding(self):
                 self.assertTrue(special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask)
 
                 # Test left padding
-                tokenizer.padding_side = "left"
-                left_padded_sequence = tokenizer.encode_plus(
-                    nodes,
-                    xpaths=xpaths,
-                    max_length=sequence_length + padding_size,
-                    padding="max_length",
-                    return_special_tokens_mask=True,
-                )
+                tokenizer_kwargs_left = {
+                    "max_length": sequence_length + padding_size,
+                    "padding": "max_length",
+                    "return_special_tokens_mask": True,
+                }
+
+                if not use_padding_as_call_kwarg:
+                    tokenizer.padding_side = "left"
+                else:
+                    tokenizer_kwargs_left["padding_side"] = "left"
+
+                left_padded_sequence = tokenizer.encode_plus(nodes, xpaths=xpaths, **tokenizer_kwargs_left)
                 left_padded_input_ids = left_padded_sequence["input_ids"]
                 left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
                 left_padded_sequence_length = len(left_padded_input_ids)
diff --git a/tests/models/tapas/test_tokenization_tapas.py b/tests/models/tapas/test_tokenization_tapas.py
index a9b8e9a0c77f..49327a39cd80 100644
--- a/tests/models/tapas/test_tokenization_tapas.py
+++ b/tests/models/tapas/test_tokenization_tapas.py
@@ -21,6 +21,7 @@
 
 import numpy as np
 import pandas as pd
+from parameterized import parameterized
 
 from transformers import AddedToken, is_torch_available
 from transformers.models.tapas.tokenization_tapas import (
@@ -494,7 +495,8 @@ def test_encode_decode_with_spaces(self):
                 decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens)
                 self.assertIn(decoded, [output, output.lower()])
 
-    def test_encode_plus_with_padding(self):
+    @parameterized.expand([(True,), (False,)])
+    def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool):
         tokenizers = self.get_tokenizers(do_lower_case=False)
         for tokenizer in tokenizers:
             with self.subTest(f"{tokenizer.__class__.__name__}"):
@@ -547,15 +549,18 @@ def test_encode_plus_with_padding(self):
                 assert special_tokens_mask == not_padded_special_tokens_mask
 
                 # Test right padding
-                tokenizer.padding_side = "right"
+                tokenizer_kwargs_right = {
+                    "max_length": sequence_length + padding_size,
+                    "padding": "max_length",
+                    "return_special_tokens_mask": True,
+                }
+
+                if not use_padding_as_call_kwarg:
+                    tokenizer.padding_side = "right"
+                else:
+                    tokenizer_kwargs_right["padding_side"] = "right"
 
-                right_padded_sequence = tokenizer.encode_plus(
-                    table,
-                    sequence,
-                    max_length=sequence_length + padding_size,
-                    padding="max_length",
-                    return_special_tokens_mask=True,
-                )
+                right_padded_sequence = tokenizer.encode_plus(table, sequence, **tokenizer_kwargs_right)
                 right_padded_input_ids = right_padded_sequence["input_ids"]
 
                 right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
@@ -566,14 +571,18 @@ def test_encode_plus_with_padding(self):
                 assert special_tokens_mask + [1] * padding_size == right_padded_special_tokens_mask
 
                 # Test left padding
-                tokenizer.padding_side = "left"
-                left_padded_sequence = tokenizer.encode_plus(
-                    table,
-                    sequence,
-                    max_length=sequence_length + padding_size,
-                    padding="max_length",
-                    return_special_tokens_mask=True,
-                )
+                tokenizer_kwargs_left = {
+                    "max_length": sequence_length + padding_size,
+                    "padding": "max_length",
+                    "return_special_tokens_mask": True,
+                }
+
+                if not use_padding_as_call_kwarg:
+                    tokenizer.padding_side = "left"
+                else:
+                    tokenizer_kwargs_left["padding_side"] = "left"
+
+                left_padded_sequence = tokenizer.encode_plus(table, sequence, **tokenizer_kwargs_left)
                 left_padded_input_ids = left_padded_sequence["input_ids"]
                 left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
                 left_padded_sequence_length = len(left_padded_input_ids)
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index 64c860e3fc17..342254dfbdf0 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -2225,7 +2225,15 @@ def test_padding_with_attention_mask(self):
                 else:
                     self.assertListEqual(padded_features["attention_mask"], [[1, 1, 1, 1, 1, 0], [0, 0, 0, 1, 1, 0]])
 
-    def test_encode_plus_with_padding(self):
+    @parameterized.expand([(True,), (False,)])
+    def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool):
+        """
+        This test checks that padding works as expected when tokenizing a sequence.
+        Padding is expected to have no effect when the input is a single sequence and
+        the padding-strategy is not `max_length`. Otherwise it pads to the specified max-length
+        using tokenizer classes `padding_side` attribute. Also, we check that passing `padding_side`
+        as call time kwarg works same way as when one sets `tokenizer.padding_side` attribute.
+        """
         tokenizers = self.get_tokenizers(do_lower_case=False)
         for tokenizer in tokenizers:
             with self.subTest(f"{tokenizer.__class__.__name__}"):
@@ -2244,8 +2252,6 @@ def test_encode_plus_with_padding(self):
                 sequence_length = len(input_ids)
 
                 # Test 'longest' and 'no_padding' don't do anything
-                tokenizer.padding_side = "right"
-
                 not_padded_sequence = tokenizer.encode_plus(
                     sequence,
                     padding=True,
@@ -2275,14 +2281,18 @@ def test_encode_plus_with_padding(self):
                 self.assertEqual(special_tokens_mask, not_padded_special_tokens_mask)
 
                 # Test right padding
-                tokenizer.padding_side = "right"
+                tokenizer_kwargs_right = {
+                    "max_length": sequence_length + padding_size,
+                    "padding": "max_length",
+                    "return_special_tokens_mask": True,
+                }
 
-                right_padded_sequence = tokenizer.encode_plus(
-                    sequence,
-                    max_length=sequence_length + padding_size,
-                    padding="max_length",
-                    return_special_tokens_mask=True,
-                )
+                if not use_padding_as_call_kwarg:
+                    tokenizer.padding_side = "right"
+                else:
+                    tokenizer_kwargs_right["padding_side"] = "right"
+
+                right_padded_sequence = tokenizer.encode_plus(sequence, **tokenizer_kwargs_right)
                 right_padded_input_ids = right_padded_sequence["input_ids"]
 
                 right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
@@ -2293,13 +2303,18 @@ def test_encode_plus_with_padding(self):
                 self.assertEqual(special_tokens_mask + [1] * padding_size, right_padded_special_tokens_mask)
 
                 # Test left padding
-                tokenizer.padding_side = "left"
-                left_padded_sequence = tokenizer.encode_plus(
-                    sequence,
-                    max_length=sequence_length + padding_size,
-                    padding="max_length",
-                    return_special_tokens_mask=True,
-                )
+                tokenizer_kwargs_left = {
+                    "max_length": sequence_length + padding_size,
+                    "padding": "max_length",
+                    "return_special_tokens_mask": True,
+                }
+
+                if not use_padding_as_call_kwarg:
+                    tokenizer.padding_side = "left"
+                else:
+                    tokenizer_kwargs_left["padding_side"] = "left"
+
+                left_padded_sequence = tokenizer.encode_plus(sequence, **tokenizer_kwargs_left)
                 left_padded_input_ids = left_padded_sequence["input_ids"]
                 left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
                 left_padded_sequence_length = len(left_padded_input_ids)

From 7a5659872a68ce9939c975b5727e5ac61136f256 Mon Sep 17 00:00:00 2001
From: Alvaro Moran <6949769+tengomucho@users.noreply.github.com>
Date: Fri, 13 Sep 2024 13:19:06 +0200
Subject: [PATCH 02/67] Mitigate a conflict when using sentencepiece (#33327)

* test(tokenizers): add a test showing conflict with sentencepiece

This is due to the fact that protobuf C implementation uses a global
pool for all added descriptors, so if two different files add
descriptors, they will end up conflicting.

* fix(tokenizers): mitigate sentencepiece/protobuf conflict

When sentencepiece is available, use that protobuf instead of the
internal one.

* chore(style): fix with ruff
---
 src/transformers/convert_slow_tokenizer.py    |  6 +++++-
 tests/tokenization/test_tokenization_utils.py | 20 ++++++++++++++++++-
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py
index f2064a131dad..eb75a46a6d9b 100644
--- a/src/transformers/convert_slow_tokenizer.py
+++ b/src/transformers/convert_slow_tokenizer.py
@@ -26,7 +26,7 @@
 from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
 from tokenizers.models import BPE, Unigram, WordPiece
 
-from .utils import is_protobuf_available, logging, requires_backends
+from .utils import is_protobuf_available, is_sentencepiece_available, logging, requires_backends
 from .utils.import_utils import PROTOBUF_IMPORT_ERROR
 
 
@@ -34,6 +34,10 @@
 
 
 def import_protobuf(error_message=""):
+    if is_sentencepiece_available():
+        from sentencepiece import sentencepiece_model_pb2
+
+        return sentencepiece_model_pb2
     if is_protobuf_available():
         import google.protobuf
 
diff --git a/tests/tokenization/test_tokenization_utils.py b/tests/tokenization/test_tokenization_utils.py
index f97ef6a63022..b43923df84d7 100644
--- a/tests/tokenization/test_tokenization_utils.py
+++ b/tests/tokenization/test_tokenization_utils.py
@@ -35,7 +35,15 @@
     is_tokenizers_available,
 )
 from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
-from transformers.testing_utils import CaptureStderr, require_flax, require_tf, require_tokenizers, require_torch, slow
+from transformers.testing_utils import (
+    CaptureStderr,
+    require_flax,
+    require_sentencepiece,
+    require_tf,
+    require_tokenizers,
+    require_torch,
+    slow,
+)
 
 
 if is_tokenizers_available():
@@ -296,3 +304,13 @@ def test_len_tokenizer(self):
                 self.assertEqual(len(tokenizer), tokenizer.vocab_size + 1)
                 self.assertEqual(len(tokenizer.added_tokens_decoder), added_tokens_size + 1)
                 self.assertEqual(len(tokenizer.added_tokens_encoder), added_tokens_size + 1)
+
+    @require_sentencepiece
+    def test_sentencepiece_cohabitation(self):
+        from sentencepiece import sentencepiece_model_pb2 as _original_protobuf  # noqa: F401
+
+        from transformers.convert_slow_tokenizer import import_protobuf  # noqa: F401
+
+        # Now this will try to import sentencepiece_model_pb2_new.py. This should not fail even if the protobuf
+        # was already imported.
+        import_protobuf()

From dfd31158eefab01952e729588a37c9fcc81f0813 Mon Sep 17 00:00:00 2001
From: Amit Garg <mitgarg17495@gmail.com>
Date: Fri, 13 Sep 2024 05:07:19 -0700
Subject: [PATCH 03/67] [Phi-3] Bug on stale kv cache  (#33129)

* fix long seq bug

* fixed format

* fixed fn copy inconsistency

* fix long seq bug

* fixed format

* fixed fn copy inconsistency

* Addressed comments

* added a unit test

* fixed cache position

* Added a warning msg to the forward fn

* fixed test case
---
 src/transformers/models/phi3/modeling_phi3.py | 23 ++++++++++-
 tests/models/phi3/test_modeling_phi3.py       | 41 +++++++++++++++++++
 2 files changed, 62 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py
index f021c6ce2d33..273b6a8f505e 100644
--- a/src/transformers/models/phi3/modeling_phi3.py
+++ b/src/transformers/models/phi3/modeling_phi3.py
@@ -257,7 +257,7 @@ def __init__(self, dim, config, device=None):
 
     @torch.no_grad()
     def forward(self, x, position_ids, seq_len=None):
-        seq_len = torch.max(position_ids) + 1
+        seq_len = seq_len or torch.max(position_ids) + 1
         if seq_len > self.original_max_position_embeddings:
             ext_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=x.device)
         else:
@@ -1239,6 +1239,15 @@ def forward(
         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
         'This is an example script .\n Certainly! Below is a sample script that demonstrates a simple task, such as calculating the sum'
         ```"""
+        if (
+            use_cache
+            and self.config.rope_scaling
+            and cache_position is not None
+            and cache_position[0] == self.config.original_max_position_embeddings
+        ):
+            logger.warning(
+                f"If you are not using the generate method, you may encounter nonsensical outputs after the {self.config.original_max_position_embeddings}th token, as the KV cache needs to be recomputed."
+            )
 
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -1295,7 +1304,6 @@ def forward(
             attentions=outputs.attentions,
         )
 
-    # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.prepare_inputs_for_generation
     def prepare_inputs_for_generation(
         self,
         input_ids,
@@ -1308,6 +1316,17 @@ def prepare_inputs_for_generation(
         num_logits_to_keep=None,
         **kwargs,
     ):
+        # When the first time input length reached long and short factor switching point, enforce re-compute cache
+        # It will cause downside of slower at this single token position, however, better than current failure.
+        if (
+            past_key_values
+            and self.config.rope_scaling
+            and input_ids.shape[1] >= self.config.original_max_position_embeddings + 1
+        ):
+            past_length = cache_position[0]
+            if past_length <= self.config.original_max_position_embeddings:
+                past_key_values = None
+
         # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
         # Exception 1: when passing input_embeds, input_ids may be missing entries
         # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
diff --git a/tests/models/phi3/test_modeling_phi3.py b/tests/models/phi3/test_modeling_phi3.py
index a3f001aba467..ce0a71878877 100644
--- a/tests/models/phi3/test_modeling_phi3.py
+++ b/tests/models/phi3/test_modeling_phi3.py
@@ -442,6 +442,47 @@ def test_model_rope_scaling_from_config(self, scaling_type):
         self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5))
         self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5))
 
+    @parameterized.expand([("longrope",)])
+    def test_model_rope_scaling_short_long_factor(self, scaling_type):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+        n_factors = config.hidden_size // config.num_key_value_heads // 2
+        config.rope_scaling = {
+            "type": scaling_type,
+            "short_factor": [3.0 for _ in range(n_factors)],
+            "long_factor": [5.0 for _ in range(n_factors)],
+        }
+        input_tensor = ids_tensor([1, 4090], config.vocab_size)
+        model = Phi3ForCausalLM(config)
+        model.to(torch_device)
+        model.eval()
+        generation_args_short = {
+            "max_length": config.original_max_position_embeddings,
+            "temperature": 0.0,
+            "use_cache": True,
+            "do_sample": False,
+            "return_dict_in_generate": True,
+        }
+        output_with_short_factor = model.generate(input_tensor, **generation_args_short)
+        keys_with_short_factor = output_with_short_factor.past_key_values[0][0]
+        generation_args_long = {
+            "max_length": config.original_max_position_embeddings + 5,
+            "temperature": 0.0,
+            "use_cache": True,
+            "do_sample": False,
+            "return_dict_in_generate": True,
+            "output_logits": True,
+        }
+        output_with_long_factor = model.generate(input_tensor, **generation_args_long)
+        keys_with_long_factor = output_with_long_factor.past_key_values[0][0]
+        last_token_logits = output_with_long_factor.logits[-1][-1]
+        regenerated_last_token_logits = model(output_with_long_factor.sequences[:, :-1]).logits[0][-1]
+        keys_with_long_factor = keys_with_long_factor[:, :, : config.original_max_position_embeddings - 1, :]
+
+        # KV cache is re-computed after reaching the (`config.original_max_position_embeddings`+1)th token position
+        self.assertFalse(torch.allclose(keys_with_short_factor, keys_with_long_factor, atol=1e-2, rtol=1e-2))
+        # Last token generated using long factor
+        self.assertTrue(torch.allclose(last_token_logits, regenerated_last_token_logits, atol=1e-2, rtol=1e-2))
+
 
 @slow
 @require_torch

From 6cc4dfe3f1e8d421c6d6351388e06e9b123cbfe1 Mon Sep 17 00:00:00 2001
From: Marc Sun <57196510+SunMarc@users.noreply.github.com>
Date: Fri, 13 Sep 2024 15:06:08 +0200
Subject: [PATCH 04/67] Fix the initialization of the cache when we have multi
 gpu (#33303)

* init cache multi-gpu

* Update src/transformers/generation/utils.py

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* switch to execution device map

* naming more consistant

* fix

* mutually exclusive device

* added an integration example

* remove useless check

* suggestion from joao + typing

* fix couple of typo and add test

* revert check

---------

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
---
 src/transformers/cache_utils.py      | 40 +++++++++----
 src/transformers/generation/utils.py | 27 +++++++++
 tests/generation/test_utils.py       | 85 ++++++++++++++++++++++++++++
 3 files changed, 141 insertions(+), 11 deletions(-)

diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py
index b3e94da3d7d7..0671157e4470 100644
--- a/src/transformers/cache_utils.py
+++ b/src/transformers/cache_utils.py
@@ -1030,6 +1030,9 @@ class StaticCache(Cache):
             The device on which the cache should be initialized. Should be the same as the layer.
         dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
             The default `dtype` to use when initializing the layer.
+        layer_device_map(`Dict[int, Union[str, torch.device, int]]]`, `optional`):
+            Mapping between the layers and its device. This is required when you are manually initializing the cache and the model is splitted between differents gpus.
+            You can know which layers mapped to which device by checking the associated device_map: `model.hf_device_map`.
 
     Example:
 
@@ -1060,6 +1063,7 @@ def __init__(
         device: torch.device = None,
         dtype: torch.dtype = torch.float32,
         max_batch_size: Optional[int] = None,
+        layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None,
     ) -> None:
         super().__init__()
         if max_batch_size is not None:
@@ -1088,16 +1092,20 @@ def __init__(
         # Note: There will be significant perf decrease if switching to use 5D tensors instead.
         cache_shape = (self.batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim)
         for idx in range(config.num_hidden_layers):
-            new_layer_key_cache = torch.zeros(cache_shape, dtype=self.dtype, device=device)
-            new_layer_value_cache = torch.zeros(cache_shape, dtype=self.dtype, device=device)
+            if layer_device_map is not None:
+                layer_device = layer_device_map[idx]
+            else:
+                layer_device = device
+            new_layer_key_cache = torch.zeros(cache_shape, dtype=self.dtype, device=layer_device)
+            new_layer_value_cache = torch.zeros(cache_shape, dtype=self.dtype, device=layer_device)
             # Notes:
             # 1. `mark_static_address` is used to tag the cache as an fixed data pointer, preventing cuda graph
             #     breaks when updating the cache. It can't be used if the cache code is being compiled (but in that case
             #     it is not needed anyway)
             # 2. `torch.export()` requires mutations to be registered as buffers.
             if not is_torchdynamo_compiling():
-                self.register_buffer(f"key_cache_{idx}", torch.zeros(cache_shape, dtype=dtype, device=device))
-                self.register_buffer(f"value_cache_{idx}", torch.zeros(cache_shape, dtype=dtype, device=device))
+                self.register_buffer(f"key_cache_{idx}", torch.zeros(cache_shape, dtype=dtype, device=layer_device))
+                self.register_buffer(f"value_cache_{idx}", torch.zeros(cache_shape, dtype=dtype, device=layer_device))
                 new_layer_key_cache = getattr(self, f"key_cache_{idx}")
                 new_layer_value_cache = getattr(self, f"value_cache_{idx}")
                 torch._dynamo.mark_static_address(new_layer_key_cache)
@@ -1130,9 +1138,9 @@ def update(
         Return:
             A tuple containing the updated key and value states.
         """
+
         cache_position = cache_kwargs.get("cache_position")
-        self.key_cache[layer_idx] = self.key_cache[layer_idx].to(device=key_states.device)
-        self.value_cache[layer_idx] = self.value_cache[layer_idx].to(device=value_states.device)
+
         k_out = self.key_cache[layer_idx]
         v_out = self.value_cache[layer_idx]
 
@@ -1201,6 +1209,9 @@ class SlidingWindowCache(StaticCache):
             The device on which the cache should be initialized. Should be the same as the layer.
         dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
             The default `dtype` to use when initializing the layer.
+        layer_device_map(`Dict[int, Union[str, torch.device, int]]]`, `optional`):
+            Mapping between the layers and its device. This is required when you are manually initializing the cache and the model is splitted between differents gpus.
+            You can know which layers mapped to which device by checking the associated device_map: `model.hf_device_map`.
 
     Example:
 
@@ -1231,6 +1242,7 @@ def __init__(
         device: torch.device = None,
         dtype: torch.dtype = torch.float32,
         max_batch_size: Optional[int] = None,
+        layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None,
     ) -> None:
         super().__init__()
         if not hasattr(config, "sliding_window") or config.sliding_window is None:
@@ -1247,6 +1259,7 @@ def __init__(
             device=device,
             dtype=dtype,
             max_batch_size=max_batch_size,
+            layer_device_map=layer_device_map,
         )
 
     def update(
@@ -1280,7 +1293,6 @@ def update(
         v_out = v_out[:, :, indices]
 
         try:
-            cache_position.to(device=k_out.device)
             k_out.index_copy_(2, cache_position, key_states)
             v_out.index_copy_(2, cache_position, value_states)
         except NotImplementedError:
@@ -1495,6 +1507,9 @@ class HybridCache(Cache):
             The device on which the cache should be initialized. Should be the same as the layer.
         dtype (torch.dtype, *optional*, defaults to `torch.float32`):
             The default `dtype` to use when initializing the layer.
+        layer_device_map(`Dict[int, Union[str, torch.device, int]]]`, `optional`):
+            Mapping between the layers and its device. This is required when you are manually initializing the cache and the model is splitted between differents gpus.
+            You can know which layers mapped to which device by checking the associated device_map: `model.hf_device_map`.
 
     Example:
 
@@ -1525,6 +1540,7 @@ def __init__(
         device: Union[torch.device, str] = "cpu",
         dtype: torch.dtype = torch.float32,
         max_batch_size: Optional[int] = None,
+        layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None,
     ) -> None:
         super().__init__()
         if max_batch_size is not None:
@@ -1562,11 +1578,15 @@ def __init__(
             self.head_dim,
         )
         for i in range(config.num_hidden_layers):
+            if layer_device_map is not None:
+                layer_device = layer_device_map[i]
+            else:
+                layer_device = device
             # Note: `mark_static_address` is used to tag the cache as an fixed data pointer, preventing cuda graph
             # breaks when updating the cache.
             cache_shape = global_cache_shape if not self.is_sliding[i] else sliding_cache_shape
-            new_layer_key_cache = torch.zeros(cache_shape, dtype=self.dtype, device=device)
-            new_layer_value_cache = torch.zeros(cache_shape, dtype=self.dtype, device=device)
+            new_layer_key_cache = torch.zeros(cache_shape, dtype=self.dtype, device=layer_device)
+            new_layer_value_cache = torch.zeros(cache_shape, dtype=self.dtype, device=layer_device)
             torch._dynamo.mark_static_address(new_layer_key_cache)
             torch._dynamo.mark_static_address(new_layer_value_cache)
             self.key_cache.append(new_layer_key_cache)
@@ -1617,8 +1637,6 @@ def update(
     ) -> Tuple[torch.Tensor]:
         cache_position = cache_kwargs.get("cache_position")
         sliding_window = cache_kwargs.get("sliding_window")
-        self.key_cache[layer_idx] = self.key_cache[layer_idx].to(device=key_states.device)
-        self.value_cache[layer_idx] = self.value_cache[layer_idx].to(device=value_states.device)
         k_out = self.key_cache[layer_idx]
         v_out = self.value_cache[layer_idx]
         if sliding_window:
diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py
index 17a234c62b28..019eb6c27f18 100644
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@@ -1446,12 +1446,39 @@ def _get_cache(
                     # models. May cause trobles with non-text modalities.
                     cache_dtype = self.get_output_embeddings().weight.dtype
 
+            def get_layer_device_map(execution_device_map: Optional[dict] = None):
+                if execution_device_map is None or len(execution_device_map) <= 1:
+                    return None
+                layer_device_map = {}
+                for layer in execution_device_map:
+                    for idx in range(self.config.num_hidden_layers):
+                        if f".{idx}." in f"{layer}.":
+                            layer_device_map[idx] = execution_device_map[layer]
+                            break
+                for idx in range(self.config.num_hidden_layers):
+                    if idx not in layer_device_map:
+                        raise RuntimeError(f"layer {idx} has not been mapped to a device.")
+                return layer_device_map
+
+            execution_device_map = None
+            # Taken from dispatch_model from accelerate.
+            # This is needed here if we don't want to make changes in accelerate in order to save execution_device
+            # For offloaded case, we need to get the execution device, not just the device where it is offloaded
+            if hasattr(self, "hf_device_map"):
+                main_device = [d for d in self.hf_device_map.values() if d not in ["cpu", "disk"]][0]
+                execution_device_map = {
+                    name: main_device if device in ["cpu", "disk"] else device
+                    for name, device in self.hf_device_map.items()
+                }
+            layer_device_map = get_layer_device_map(execution_device_map)
+
             cache_kwargs = {
                 "config": self.config if hasattr(self.config, "text_config") else self.config,
                 "max_batch_size": batch_size,
                 "max_cache_len": max_cache_len,
                 "device": device,
                 "dtype": cache_dtype,
+                "layer_device_map": layer_device_map,
             }
             self._cache = cache_cls(**cache_kwargs)
             if requires_cross_attention_cache:
diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py
index 65507795c84d..0ed054ad5869 100644
--- a/tests/generation/test_utils.py
+++ b/tests/generation/test_utils.py
@@ -3444,6 +3444,91 @@ def test_special_tokens_fall_back_to_model_default(self):
         self.assertTrue(test_bos_id == gen_output[0, 0])
         self.assertTrue(generation_config.bos_token_id is None)
 
+    @pytest.mark.generate
+    @require_torch_multi_gpu
+    def test_generate_with_static_cache_multi_gpu(self):
+        """
+        Tests if the static cache has been set correctly and if generate works correctly when we are using multi-gpus.
+        """
+        # need to split manually as auto doesn't work well with unbalanced model
+        device_map = {"model.embed_tokens": 0, "model.layers.0": 0, "model.layers.1": 1, "model.norm": 1, "lm_head": 0}
+        model = AutoModelForCausalLM.from_pretrained(
+            "hf-internal-testing/tiny-random-MistralForCausalLM", device_map=device_map
+        )
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM")
+
+        text = "Hello world"
+        tokenized_inputs = tokenizer([text], return_tensors="pt")
+        input_ids = tokenized_inputs.input_ids.to(torch_device)
+
+        generation_kwargs = {
+            "max_new_tokens": 20,
+            "cache_implementation": "static",
+            "return_dict_in_generate": True,  # Required to return `past_key_values`
+        }
+
+        results = model.generate(input_ids, **generation_kwargs)
+        self.assertTrue(isinstance(results.past_key_values, StaticCache))
+
+        # check device of each layer
+        key_cache_0 = results.past_key_values.key_cache[0]
+        value_cache_0 = results.past_key_values.value_cache[0]
+        self.assertTrue(key_cache_0.device == value_cache_0.device == torch.device(0))
+
+        key_cache_1 = results.past_key_values.key_cache[1]
+        value_cache_1 = results.past_key_values.value_cache[1]
+        self.assertTrue(key_cache_1.device == value_cache_1.device == torch.device(1))
+
+    @pytest.mark.generate
+    @require_torch_multi_gpu
+    def test_init_static_cache_multi_gpu(self):
+        """
+        Tests if the static cache has been set correctly when we initialize it manually in a multi-gpu setup.
+        """
+        # need to split manually as auto doesn't work well with unbalanced model
+        device_map = {"model.embed_tokens": 0, "model.layers.0": 0, "model.layers.1": 1, "model.norm": 1, "lm_head": 0}
+        model = AutoModelForCausalLM.from_pretrained(
+            "hf-internal-testing/tiny-random-MistralForCausalLM", device_map=device_map
+        )
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM")
+
+        text = "Hello world"
+        tokenized_inputs = tokenizer([text], return_tensors="pt")
+        input_ids = tokenized_inputs.input_ids.to(torch_device)
+
+        generation_kwargs = {
+            "max_new_tokens": 20,
+            "return_dict_in_generate": True,  # Required to return `past_key_values`
+        }
+
+        # TODO: We need to raise a warning in case the cache is not set correctly
+        # with self.assertRaisesRegex(ValueError, "If you are manually initializing the cache"):
+        #     past_key_values = StaticCache(
+        #         config=model.config, batch_size=1, max_cache_len=30, device=torch_device, dtype=model.dtype
+        #     )
+        #     results = model.generate(input_ids, past_key_values=past_key_values, **generation_kwargs)
+
+        # deduced from the device_map : layer 0 on device 0 and layer 1 on device 1
+        layer_device_map = {0: 0, 1: 1}
+        past_key_values = StaticCache(
+            config=model.config,
+            batch_size=1,
+            max_cache_len=30,
+            device=torch_device,
+            dtype=model.dtype,
+            layer_device_map=layer_device_map,
+        )
+        results = model.generate(input_ids, past_key_values=past_key_values, **generation_kwargs)
+
+        # check device of each layer
+        key_cache_0 = results.past_key_values.key_cache[0]
+        value_cache_0 = results.past_key_values.value_cache[0]
+        self.assertTrue(key_cache_0.device == value_cache_0.device == torch.device(0))
+
+        key_cache_1 = results.past_key_values.key_cache[1]
+        value_cache_1 = results.past_key_values.value_cache[1]
+        self.assertTrue(key_cache_1.device == value_cache_1.device == torch.device(1))
+
 
 @require_torch
 class TokenHealingTestCase(unittest.TestCase):

From 0963229e287501bed52ae1dabc17922524de6992 Mon Sep 17 00:00:00 2001
From: Marc Sun <57196510+SunMarc@users.noreply.github.com>
Date: Fri, 13 Sep 2024 15:07:12 +0200
Subject: [PATCH 05/67] Enable finetuning with torchao quantized model 
 (#33361)

enable training
---
 src/transformers/quantizers/quantizer_torchao.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/transformers/quantizers/quantizer_torchao.py b/src/transformers/quantizers/quantizer_torchao.py
index 3b5dfff20904..02ea8294a2d5 100644
--- a/src/transformers/quantizers/quantizer_torchao.py
+++ b/src/transformers/quantizers/quantizer_torchao.py
@@ -166,7 +166,8 @@ def is_serializable(self):
 
     @property
     def is_trainable(self):
-        # torchao does not have official support for QAT (Quantization Aware Training)
-        # but torchao support nf4/PEFT, but it is not integrated yet
-        # TODO: if this is supported in the future, do a version check here.
-        return False
+        supported_quant_types_for_training = [
+            "int8_weight_only",
+            "int8_dynamic_activation_int8_weight",
+        ]
+        return self.quantization_config.quant_type in supported_quant_types_for_training

From e39b6c1c7cdc890b6849b8c9de545fc9590ba871 Mon Sep 17 00:00:00 2001
From: Sergio Paniego Blanco <sergiopaniegoblanco@gmail.com>
Date: Fri, 13 Sep 2024 17:15:20 +0200
Subject: [PATCH 06/67] Corrected `Agents and tools` documentation links typos
 (#33471)

* Corrected agents task link typo

* Corrected chat templating link

* Corrected chat templating link 2
---
 docs/source/en/agents.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/source/en/agents.md b/docs/source/en/agents.md
index b100e39f1c95..0b889f4eec86 100644
--- a/docs/source/en/agents.md
+++ b/docs/source/en/agents.md
@@ -19,7 +19,7 @@ rendered properly in your Markdown viewer.
 
 ### What is an agent?
 
-Large Language Models (LLMs) trained to perform [causal language modeling](./tasks/language_modeling.) can tackle a wide range of tasks, but they often struggle with basic tasks like logic, calculation, and search. When prompted in domains in which they do not perform well, they often fail to generate the answer we expect them to.
+Large Language Models (LLMs) trained to perform [causal language modeling](./tasks/language_modeling) can tackle a wide range of tasks, but they often struggle with basic tasks like logic, calculation, and search. When prompted in domains in which they do not perform well, they often fail to generate the answer we expect them to.
 
 One approach to overcome this weakness is to create an *agent*.
 
@@ -114,7 +114,7 @@ To start with, please install the `agents` extras in order to install all defaul
 pip install transformers[agents]
 ```
 
-Build your LLM engine by defining a `llm_engine` method which accepts a list of [messages](./chat_templating.) and returns text. This callable also needs to accept a `stop` argument that indicates when to stop generating.
+Build your LLM engine by defining a `llm_engine` method which accepts a list of [messages](./chat_templating) and returns text. This callable also needs to accept a `stop` argument that indicates when to stop generating.
 
 ```python
 from huggingface_hub import login, InferenceClient
@@ -130,7 +130,7 @@ def llm_engine(messages, stop_sequences=["Task"]) -> str:
 ```
 
 You could use any `llm_engine` method as long as:
-1. it follows the [messages format](./chat_templating.md) (`List[Dict[str, str]]`) for its input `messages`, and it returns a `str`.
+1. it follows the [messages format](./chat_templating) (`List[Dict[str, str]]`) for its input `messages`, and it returns a `str`.
 2. it stops generating outputs at the sequences passed in the argument `stop_sequences`
 
 Additionally, `llm_engine` can also take a `grammar` argument. In the case where you specify a `grammar` upon agent initialization, this argument will be passed to the calls to llm_engine, with the `grammar` that you defined upon initialization, to allow [constrained generation](https://huggingface.co/docs/text-generation-inference/conceptual/guidance) in order to force properly-formatted agent outputs.

From 7bb1c99800d235791dace10305731f377db8077b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?David=20Lemayian=20=E2=9C=A8?=
 <877919+DavidLemayian@users.noreply.github.com>
Date: Sat, 14 Sep 2024 00:25:20 +0300
Subject: [PATCH 07/67] chore: fix typo in comment in
 tokenization_utils_base.py (#33466)

docs: update grammar in comment in tokenization_utils_base.py

small grammar update in tokenization_utils_base.py comment
---
 src/transformers/tokenization_utils_base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 93dea5ba09de..b4490578a709 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -3457,7 +3457,7 @@ def pad(
         if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], Mapping):
             encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}
 
-        # The model's main input name, usually `input_ids`, has be passed for padding
+        # The model's main input name, usually `input_ids`, has been passed for padding
         if self.model_input_names[0] not in encoded_inputs:
             raise ValueError(
                 "You should supply an encoding or a list of encodings to this method "

From 8bd2b1e8c23234cd607ca8d63f53c1edfea27462 Mon Sep 17 00:00:00 2001
From: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Date: Sat, 14 Sep 2024 12:28:39 +0200
Subject: [PATCH 08/67] Add support for Pixtral (#33449)

* initial commit

* gloups

* updates

* work

* weights match

* nits

* nits

* updates to support the tokenizer :)

* updates

* Pixtral processor (#33454)

* rough outline

* Add in image break and end tokens

* Fix

* Udo some formatting changes

* Set patch_size default

* Fix

* Fix token expansion

* nit in conversion script

* Fix image token list creation

* done

* add expected results

* Process list of list of images (#33465)

* updates

* working image and processor

* this is the expected format

* some fixes

* push current updated

* working mult images!

* add a small integration test

* Uodate configuration docstring

* Formatting

* Config docstring fix

* simplify model test

* fixup modeling and etests

* Return BatchMixFeature in image processor

* fix some copies

* update

* nits

* Update model docstring

* Apply suggestions from code review

* Fix up

* updates

* revert modeling changes

* update

* update

* fix load safe

* addd liscence

* update

* use pixel_values as required by the model

* skip some tests and refactor

* Add pixtral image processing tests (#33476)

* Image processing tests

* Add processing tests

* woops

* defaults reflect pixtral image processor

* fixup post merge

* images -> pixel values

* oups sorry Mr docbuilder

* isort

* fix

* fix processor tests

* small fixes

* nit

* update

* last nits

* oups this was really breaking!

* nits

* is composition needs to be true

---------

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
---
 docs/source/en/_toctree.yml                   |   2 +
 docs/source/en/index.md                       |   1 +
 docs/source/en/model_doc/pixtral.md           |  98 ++++
 src/transformers/__init__.py                  |  13 +-
 src/transformers/models/__init__.py           |   1 +
 .../models/auto/configuration_auto.py         |   2 +
 .../models/auto/image_processing_auto.py      |   1 +
 src/transformers/models/auto/modeling_auto.py |   1 +
 .../models/auto/processing_auto.py            |   1 +
 .../models/auto/tokenization_auto.py          |   1 +
 .../models/llava/configuration_llava.py       |   2 +-
 src/transformers/models/pixtral/__init__.py   |  70 +++
 .../models/pixtral/configuration_pixtral.py   | 103 ++++
 .../pixtral/convert_pixtral_weights_to_hf.py  | 285 ++++++++++
 .../pixtral/image_processing_pixtral.py       | 519 ++++++++++++++++++
 .../models/pixtral/modeling_pixtral.py        | 517 +++++++++++++++++
 .../models/pixtral/processing_pixtral.py      | 282 ++++++++++
 src/transformers/utils/dummy_pt_objects.py    |  14 +
 .../utils/dummy_vision_objects.py             |   7 +
 tests/models/llava/test_modeling_llava.py     |  47 ++
 tests/models/pixtral/__init__.py              |   0
 .../pixtral/test_image_processing_pixtral.py  | 217 ++++++++
 tests/models/pixtral/test_modeling_pixtral.py | 292 ++++++++++
 .../models/pixtral/test_processor_pixtral.py  | 233 ++++++++
 24 files changed, 2707 insertions(+), 2 deletions(-)
 create mode 100644 docs/source/en/model_doc/pixtral.md
 create mode 100644 src/transformers/models/pixtral/__init__.py
 create mode 100644 src/transformers/models/pixtral/configuration_pixtral.py
 create mode 100644 src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py
 create mode 100644 src/transformers/models/pixtral/image_processing_pixtral.py
 create mode 100644 src/transformers/models/pixtral/modeling_pixtral.py
 create mode 100644 src/transformers/models/pixtral/processing_pixtral.py
 create mode 100644 tests/models/pixtral/__init__.py
 create mode 100644 tests/models/pixtral/test_image_processing_pixtral.py
 create mode 100644 tests/models/pixtral/test_modeling_pixtral.py
 create mode 100644 tests/models/pixtral/test_processor_pixtral.py

diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 1c7f62ec6ea7..235ea81a7f1e 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -862,6 +862,8 @@
         title: Perceiver
       - local: model_doc/pix2struct
         title: Pix2Struct
+      - local: model_doc/pixtral
+        title: Pixtral
       - local: model_doc/sam
         title: Segment Anything
       - local: model_doc/siglip
diff --git a/docs/source/en/index.md b/docs/source/en/index.md
index 8e3a4da8b021..c18426de4c03 100644
--- a/docs/source/en/index.md
+++ b/docs/source/en/index.md
@@ -253,6 +253,7 @@ Flax), PyTorch, and/or TensorFlow.
 |                          [Phi3](model_doc/phi3)                          |       ✅        |         ❌         |      ❌      |
 |                       [PhoBERT](model_doc/phobert)                       |       ✅        |         ✅         |      ✅      |
 |                    [Pix2Struct](model_doc/pix2struct)                    |       ✅        |         ❌         |      ❌      |
+|                       [Pixtral](model_doc/pixtral)                       |       ❌        |         ❌         |      ❌      |
 |                        [PLBart](model_doc/plbart)                        |       ✅        |         ❌         |      ❌      |
 |                    [PoolFormer](model_doc/poolformer)                    |       ✅        |         ❌         |      ❌      |
 |                     [Pop2Piano](model_doc/pop2piano)                     |       ✅        |         ❌         |      ❌      |
diff --git a/docs/source/en/model_doc/pixtral.md b/docs/source/en/model_doc/pixtral.md
new file mode 100644
index 000000000000..8df2bf5af5f9
--- /dev/null
+++ b/docs/source/en/model_doc/pixtral.md
@@ -0,0 +1,98 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Pixtral
+
+## Overview
+
+The Pixtral model was released by the Mistral AI team on [Vllm](https://github.com/vllm-project/vllm/pull/8377), where a version of the code can be found!
+
+
+Tips:
+
+- Pixtral is a multimodal model, the main contribution is the 2d ROPE on the images, and support for arbitrary image size (the images are not padded together nor are they resized)
+- This model follows the `Llava` familiy, meaning image embeddings are placed instead of the `[IMG]` token placeholders. 
+- The format for one or mulitple prompts is the following:
+```
+"<s>[INST][IMG]\nWhat are the things I should be cautious about when I visit this place?[/INST]"
+```
+Then, the processor will replace each `[IMG]` token with  a number of `[IMG]` token that depends on the height and the width of the image. Each *row* of the image is separated by a `[IMG_BREAK]` token, and each image is separated by a  `[IMG_END]` token.
+
+This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts) and [ArthurZ](https://huggingface.co/ArthurZ)
+
+Here is an example of how to run it:
+
+```python 
+from transformers import LlavaForConditionalGeneration, AutoProcessor
+from PIL import Image
+
+model_id = "hf-internal-testing/pixtral-12b"
+model = LlavaForConditionalGeneration.from_pretrained(model_id).to("cuda")
+processor = AutoProcessor.from_pretrained(model_id)
+
+IMG_URLS = [
+    "https://picsum.photos/id/237/400/300",
+    "https://picsum.photos/id/231/200/300",
+    "https://picsum.photos/id/27/500/500",
+    "https://picsum.photos/id/17/150/600",
+]
+PROMPT = "<s>[INST]Describe the images.\n[IMG][IMG][IMG][IMG][/INST]"
+
+inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to("cuda")
+generate_ids = model.generate(**inputs, max_new_tokens=500)
+ouptut = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+
+EXPECTED_GENERATION = """
+Describe the images.
+Sure, let's break down each image description:
+
+1. **Image 1:**
+   - **Description:** A black dog with a glossy coat is sitting on a wooden floor. The dog has a focused expression and is looking directly at the camera.
+   - **Details:** The wooden floor has a rustic appearance with visible wood grain patterns. The dog's eyes are a striking color, possibly brown or amber, which contrasts with its black fur.
+
+2. **Image 2:**
+   - **Description:** A scenic view of a mountainous landscape with a winding road cutting through it. The road is surrounded by lush green vegetation and leads to a distant valley.
+   - **Details:** The mountains are rugged with steep slopes, and the sky is clear, indicating good weather. The winding road adds a sense of depth and perspective to the image.
+
+3. **Image 3:**
+   - **Description:** A beach scene with waves crashing against the shore. There are several people in the water and on the beach, enjoying the waves and the sunset.
+   - **Details:** The waves are powerful, creating a dynamic and lively atmosphere. The sky is painted with hues of orange and pink from the setting sun, adding a warm glow to the scene.
+
+4. **Image 4:**
+   - **Description:** A garden path leading to a large tree with a bench underneath it. The path is bordered by well-maintained grass and flowers.
+   - **Details:** The path is made of small stones or gravel, and the tree provides a shaded area with the bench invitingly placed beneath it. The surrounding area is lush and green, suggesting a well-kept garden.
+
+Each image captures a different scene, from a close-up of a dog to expansive natural landscapes, showcasing various elements of nature and human interaction with it.
+"""
+
+```
+## PixtralVisionConfig
+
+[[autodoc]] PixtralVisionConfig
+
+## PixtralModel
+
+[[autodoc]] PixtralModel
+    - forward
+
+## PixtralImageProcessor
+
+[[autodoc]] PixtralImageProcessor
+    - preprocess
+
+## PixtralProcessor
+
+[[autodoc]] PixtralProcessor
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 00cc67915f36..36775d8454ab 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -649,6 +649,7 @@
         "Pix2StructTextConfig",
         "Pix2StructVisionConfig",
     ],
+    "models.pixtral": ["PixtralProcessor", "PixtralVisionConfig"],
     "models.plbart": ["PLBartConfig"],
     "models.poolformer": ["PoolFormerConfig"],
     "models.pop2piano": ["Pop2PianoConfig"],
@@ -1199,6 +1200,7 @@
     _import_structure["models.owlvit"].extend(["OwlViTFeatureExtractor", "OwlViTImageProcessor"])
     _import_structure["models.perceiver"].extend(["PerceiverFeatureExtractor", "PerceiverImageProcessor"])
     _import_structure["models.pix2struct"].extend(["Pix2StructImageProcessor"])
+    _import_structure["models.pixtral"].append("PixtralImageProcessor")
     _import_structure["models.poolformer"].extend(["PoolFormerFeatureExtractor", "PoolFormerImageProcessor"])
     _import_structure["models.pvt"].extend(["PvtImageProcessor"])
     _import_structure["models.qwen2_vl"].extend(["Qwen2VLImageProcessor"])
@@ -1359,7 +1361,6 @@
             "AlignVisionModel",
         ]
     )
-
     _import_structure["models.altclip"].extend(
         [
             "AltCLIPModel",
@@ -2977,6 +2978,7 @@
             "Pix2StructVisionModel",
         ]
     )
+    _import_structure["models.pixtral"].extend(["PixtralModel", "PixtralPreTrainedModel"])
     _import_structure["models.plbart"].extend(
         [
             "PLBartForCausalLM",
@@ -5434,6 +5436,10 @@
         Pix2StructTextConfig,
         Pix2StructVisionConfig,
     )
+    from .models.pixtral import (
+        PixtralProcessor,
+        PixtralVisionConfig,
+    )
     from .models.plbart import PLBartConfig
     from .models.poolformer import (
         PoolFormerConfig,
@@ -6009,6 +6015,7 @@
         from .models.owlvit import OwlViTFeatureExtractor, OwlViTImageProcessor
         from .models.perceiver import PerceiverFeatureExtractor, PerceiverImageProcessor
         from .models.pix2struct import Pix2StructImageProcessor
+        from .models.pixtral import PixtralImageProcessor
         from .models.poolformer import (
             PoolFormerFeatureExtractor,
             PoolFormerImageProcessor,
@@ -7448,6 +7455,10 @@
             Pix2StructTextModel,
             Pix2StructVisionModel,
         )
+        from .models.pixtral import (
+            PixtralModel,
+            PixtralPreTrainedModel,
+        )
         from .models.plbart import (
             PLBartForCausalLM,
             PLBartForConditionalGeneration,
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
index 26b96def67d9..2022048cd455 100644
--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@@ -187,6 +187,7 @@
     phi3,
     phobert,
     pix2struct,
+    pixtral,
     plbart,
     poolformer,
     pop2piano,
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index fa1a7fb88eaf..2cd7d550d90b 100644
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -205,6 +205,7 @@
         ("phi", "PhiConfig"),
         ("phi3", "Phi3Config"),
         ("pix2struct", "Pix2StructConfig"),
+        ("pixtral", "PixtralVisionConfig"),
         ("plbart", "PLBartConfig"),
         ("poolformer", "PoolFormerConfig"),
         ("pop2piano", "Pop2PianoConfig"),
@@ -509,6 +510,7 @@
         ("phi3", "Phi3"),
         ("phobert", "PhoBERT"),
         ("pix2struct", "Pix2Struct"),
+        ("pixtral", "Pixtral"),
         ("plbart", "PLBart"),
         ("poolformer", "PoolFormer"),
         ("pop2piano", "Pop2Piano"),
diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
index c83c43518a6a..95d9ddef8f79 100644
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -114,6 +114,7 @@
             ("owlvit", ("OwlViTImageProcessor",)),
             ("perceiver", ("PerceiverImageProcessor",)),
             ("pix2struct", ("Pix2StructImageProcessor",)),
+            ("pixtral", ("PixtralImageProcessor",)),
             ("poolformer", ("PoolFormerImageProcessor",)),
             ("pvt", ("PvtImageProcessor",)),
             ("pvt_v2", ("PvtImageProcessor",)),
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index 45a9c4d0d078..e0d15f1e2365 100644
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -193,6 +193,7 @@
         ("persimmon", "PersimmonModel"),
         ("phi", "PhiModel"),
         ("phi3", "Phi3Model"),
+        ("pixtral", "PixtralModel"),
         ("plbart", "PLBartModel"),
         ("poolformer", "PoolFormerModel"),
         ("prophetnet", "ProphetNetModel"),
diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py
index 7f49e0e8d997..82d325248eab 100644
--- a/src/transformers/models/auto/processing_auto.py
+++ b/src/transformers/models/auto/processing_auto.py
@@ -82,6 +82,7 @@
         ("owlvit", "OwlViTProcessor"),
         ("paligemma", "PaliGemmaProcessor"),
         ("pix2struct", "Pix2StructProcessor"),
+        ("pixtral", "PixtralProcessor"),
         ("pop2piano", "Pop2PianoProcessor"),
         ("qwen2_audio", "Qwen2AudioProcessor"),
         ("qwen2_vl", "Qwen2VLProcessor"),
diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
index c8eb06db04a0..e735579108d8 100644
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -385,6 +385,7 @@
             ("phi3", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
             ("phobert", ("PhobertTokenizer", None)),
             ("pix2struct", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)),
+            ("pixtral", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
             ("plbart", ("PLBartTokenizer" if is_sentencepiece_available() else None, None)),
             ("prophetnet", ("ProphetNetTokenizer", None)),
             ("qdqbert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
diff --git a/src/transformers/models/llava/configuration_llava.py b/src/transformers/models/llava/configuration_llava.py
index f2338a7c5a5d..3a4cb09855f0 100644
--- a/src/transformers/models/llava/configuration_llava.py
+++ b/src/transformers/models/llava/configuration_llava.py
@@ -73,7 +73,7 @@ class LlavaConfig(PretrainedConfig):
     ```"""
 
     model_type = "llava"
-    is_composition = False
+    is_composition = True
 
     def __init__(
         self,
diff --git a/src/transformers/models/pixtral/__init__.py b/src/transformers/models/pixtral/__init__.py
new file mode 100644
index 000000000000..e09ed8e60127
--- /dev/null
+++ b/src/transformers/models/pixtral/__init__.py
@@ -0,0 +1,70 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
+
+
+_import_structure = {
+    "configuration_pixtral": ["PixtralVisionConfig"],
+    "processing_pixtral": ["PixtralProcessor"],
+}
+
+
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_pixtral"] = [
+        "PixtralModel",
+        "PixtralPreTrainedModel",
+    ]
+
+try:
+    if not is_vision_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["image_processing_pixtral"] = ["PixtralImageProcessor"]
+
+
+if TYPE_CHECKING:
+    from .configuration_pixtral import PixtralProcessor, PixtralVisionConfig
+
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_pixtral import (
+            PixtralModel,
+            PixtralPreTrainedModel,
+        )
+
+    try:
+        if not is_vision_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .image_processing_pixtral import PixtralImageProcessor
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
diff --git a/src/transformers/models/pixtral/configuration_pixtral.py b/src/transformers/models/pixtral/configuration_pixtral.py
new file mode 100644
index 000000000000..dcc1e458ca78
--- /dev/null
+++ b/src/transformers/models/pixtral/configuration_pixtral.py
@@ -0,0 +1,103 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Pixtral model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class PixtralVisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`PixtralModel`]. It is used to instantiate an
+    Pixtral model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Pixtral-9B.
+
+    e.g. [pixtral-hf/pixtral-9b](https://huggingface.co/pixtral-hf/pixtral-9b)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 4096):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads in the Transformer encoder.
+        num_channels (`int`, *optional*, defaults to 3):
+            Number of input channels in the input images.
+        image_size (`int`, *optional*, defaults to 1024):
+            Max dimension of the input images.
+        patch_size (`int`, *optional*, defaults to 16):
+            Size of the image patches.
+        hidden_act (`str`, *optional*, defaults to `"gelu"`):
+            Activation function used in the hidden layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            Dropout probability for the attention layers.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie the word embeddings with the input embeddings.
+
+    Example:
+
+    ```python
+    >>> from transformers import PixtralModel, PixtralVisionConfig, CLIPVisionConfig, LlamaConfig
+
+    >>> # Initializing a Pixtral 12B style configuration
+    >>> config = PixtralVisionConfig()
+
+    >>> # Initializing a model from the pixtral 12B style configuration
+    >>> model = PixtralModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "pixtral"
+
+    def __init__(
+        self,
+        hidden_size=1024,
+        intermediate_size=4096,
+        num_hidden_layers=24,
+        num_attention_heads=16,
+        num_channels=3,
+        image_size=1024,
+        patch_size=16,
+        hidden_act="gelu",
+        attention_dropout=0.0,
+        rope_theta=10000.0,
+        tie_word_embeddings=False,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.attention_dropout = attention_dropout
+        self.hidden_act = hidden_act
+        self.rope_theta = rope_theta
+        self.tie_word_embeddings = tie_word_embeddings
+        self.head_dim = hidden_size // num_attention_heads
diff --git a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py
new file mode 100644
index 000000000000..c4190082d994
--- /dev/null
+++ b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py
@@ -0,0 +1,285 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+
+import regex as re
+import torch
+from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
+from safetensors.torch import load_file as safe_load_file
+from tokenizers import Regex, Tokenizer, decoders, pre_tokenizers, processors
+from tokenizers.models import BPE
+
+from transformers import (
+    LlavaConfig,
+    LlavaForConditionalGeneration,
+    MistralConfig,
+    PixtralImageProcessor,
+    PixtralProcessor,
+    PixtralVisionConfig,
+    PreTrainedTokenizerFast,
+)
+from transformers.convert_slow_tokenizer import bytes_to_unicode
+
+
+"""
+# Here is how to get the original tokens!
+model_name = "mistralai/Pixtral-12B-2409"
+tok = MistralTokenizer.from_model(model_name)
+
+from mistral_common.protocol.instruct.request import ChatCompletionRequest, UserMessage, ImageChunk, TextChunk
+
+EXPECTED_TOKENS = tok.encode_chat_completion(
+    ChatCompletionRequest(
+        messages=[
+            UserMessage(
+                content=[
+                    TextChunk(text="Describe the images"),
+                ] + [ImageChunk(image=img) for img in IMG_URLS]
+            )
+        ],
+        model="pixtral",
+    )
+)
+assert tokenizer.decode(inputs["input_ids"][0]) == EXPECTED_TOKENS
+"""
+
+OLD_KEY_TO_NEW_KEY_MAPPING = {
+    # Layer Normalization Weights
+    r"vision_encoder.transformer.layers.(\d+).input_layernorm.weight": r"vision_tower.transformer.layers.\1.attention_norm.weight",
+    r"vision_encoder.transformer.layers.(\d+).ffn_norm.weight": r"vision_tower.transformer.layers.\1.ffn_norm.weight",
+    # Self Attention Projections
+    r"vision_encoder.transformer.layers.(\d+).attention.wq.weight": r"vision_tower.transformer.layers.\1.attention.q_proj.weight",
+    r"vision_encoder.transformer.layers.(\d+).attention.wk.weight": r"vision_tower.transformer.layers.\1.attention.k_proj.weight",
+    r"vision_encoder.transformer.layers.(\d+).attention.wv.weight": r"vision_tower.transformer.layers.\1.attention.v_proj.weight",
+    r"vision_encoder.transformer.layers.(\d+).attention.wo.weight": r"vision_tower.transformer.layers.\1.attention.o_proj.weight",
+    # MLP Projections
+    r"vision_encoder.transformer.layers.(\d+).feed_forward.w1.weight": r"vision_tower.transformer.layers.\1.feed_forward.gate_proj.weight",
+    r"vision_encoder.transformer.layers.(\d+).feed_forward.w2.weight": r"vision_tower.transformer.layers.\1.feed_forward.down_proj.weight",
+    r"vision_encoder.transformer.layers.(\d+).feed_forward.w3.weight": r"vision_tower.transformer.layers.\1.feed_forward.up_proj.weight",
+    # Additional mappings
+    r"vision_encoder": r"vision_tower",
+    r"vision_language_adapter.w_in": r"multi_modal_projector.linear_1",
+    r"vision_language_adapter.w_out": r"multi_modal_projector.linear_2",
+    r"layers.(\d+).attention.wq.weight": r"language_model.model.layers.\1.self_attn.q_proj.weight",
+    r"layers.(\d+).attention.wk.weight": r"language_model.model.layers.\1.self_attn.k_proj.weight",
+    r"layers.(\d+).attention.wv.weight": r"language_model.model.layers.\1.self_attn.v_proj.weight",
+    r"layers.(\d+).attention.wo.weight": r"language_model.model.layers.\1.self_attn.o_proj.weight",
+    r"layers.(\d+).feed_forward.w1.weight": r"language_model.model.layers.\1.mlp.gate_proj.weight",
+    r"layers.(\d+).feed_forward.w2.weight": r"language_model.model.layers.\1.mlp.down_proj.weight",
+    r"layers.(\d+).feed_forward.w3.weight": r"language_model.model.layers.\1.mlp.up_proj.weight",
+    r"layers.(\d+).ffn_norm.weight": r"language_model.model.layers.\1.post_attention_layernorm.weight",
+    r"layers.(\d+).attention_norm.weight": r"language_model.model.layers.\1.input_layernorm.weight",
+    r"tok_embeddings.weight": r"language_model.model.embed_tokens.weight",
+    r"output.weight": r"language_model.lm_head.weight",
+    r"norm.weight": r"language_model.model.norm.weight",
+}
+
+
+class MistralConverter:
+    """
+    A general tiktoken converter.
+    """
+
+    def __init__(
+        self,
+        vocab=None,
+        pattern=r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""",
+        add_prefix_space=False,
+        additional_special_tokens=None,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(*args)
+        self.vocab = vocab
+        self.pattern = pattern
+        self.add_prefix_space = add_prefix_space
+        self.additional_special_tokens = additional_special_tokens
+
+    def extract_vocab_merges_from_model(self, vocab: str):
+        bpe_ranks = vocab
+        byte_encoder = bytes_to_unicode()
+
+        def token_bytes_to_string(b):
+            return "".join([byte_encoder[ord(char)] for char in b.decode("latin-1")])
+
+        merges = []
+        vocab = {}
+        for idx, (token, rank) in enumerate(bpe_ranks.items()):
+            if token not in self.additional_special_tokens:
+                vocab[token_bytes_to_string(token)] = idx
+                if len(token) == 1:
+                    continue
+                local = []
+                for index in range(1, len(token)):
+                    piece_l, piece_r = token[:index], token[index:]
+                    if piece_l in bpe_ranks and piece_r in bpe_ranks and (piece_l + piece_r) in bpe_ranks:
+                        local.append((piece_l, piece_r, rank))
+                local = sorted(local, key=lambda x: (bpe_ranks[x[0]], bpe_ranks[x[1]]), reverse=False)
+                merges.extend(local)
+            else:
+                vocab[token] = idx
+        merges = sorted(merges, key=lambda val: val[2], reverse=False)
+        merges = [(token_bytes_to_string(val[0]), token_bytes_to_string(val[1])) for val in merges]
+        return vocab, merges
+
+    def tokenizer(self):
+        vocab_scores, merges = self.extract_vocab_merges_from_model(self.vocab)
+        tokenizer = Tokenizer(BPE(vocab_scores, merges, fuse_unk=False))
+        if hasattr(tokenizer.model, "ignore_merges"):
+            tokenizer.model.ignore_merges = True
+        return tokenizer
+
+    def converted(self) -> Tokenizer:
+        tokenizer = self.tokenizer()
+        tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
+            [
+                pre_tokenizers.Split(Regex(self.pattern), behavior="isolated", invert=False),
+                pre_tokenizers.ByteLevel(add_prefix_space=self.add_prefix_space, use_regex=False),
+            ]
+        )
+        tokenizer.decoder = decoders.ByteLevel()
+        tokenizer.add_special_tokens(self.additional_special_tokens)
+
+        tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
+
+        return tokenizer
+
+
+def convert_mistral_tokenizer():
+    model_name = "mistralai/Pixtral-12B-2409"
+
+    tokenizer = MistralTokenizer.from_model(model_name)
+
+    vocab = tokenizer.instruct_tokenizer.tokenizer._tekken_token2id_nospecial
+    all_special = [
+        token.value if hasattr(token, "value") else token
+        for token in tokenizer.instruct_tokenizer.tokenizer._all_special_tokens
+    ]
+    specials_tokens = {token: all_special.index(token) for token in all_special}
+    specials_tokens.update(vocab)
+    vocab = specials_tokens
+
+    tokenizer = PreTrainedTokenizerFast(
+        tokenizer_object=MistralConverter(vocab=vocab, additional_special_tokens=all_special).converted(),
+        bos_token="<s>",
+        unk_token="<unk>",
+        eos_token="</s>",
+    )
+    tokenizer.model_input_names = ["input_ids", "attention_mask"]
+
+    return tokenizer
+
+
+def permute_for_rope(value, n_heads, config):
+    dim1 = value.shape[0]
+    dim2 = config.hidden_size
+    return value.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2)
+
+
+def convert_dictionnary(original_state_dict, vision_config, text_config):
+    new_dict = {}
+
+    all_keys = "\n" + "\n".join(original_state_dict.keys())
+    old_keys = all_keys
+    for old, new in OLD_KEY_TO_NEW_KEY_MAPPING.items():
+        all_keys = re.sub(r"\n" + old, r"\n" + new, all_keys)
+
+    OLD_TO_NEW = dict(zip(old_keys.split("\n"), all_keys.split("\n")))
+
+    for key, value in original_state_dict.items():
+        new_key = OLD_TO_NEW[key]
+        if "vision_encoder" in key:
+            _config = vision_config
+            num_attention_heads = _config.num_attention_heads
+        else:
+            _config = text_config
+            if "q_proj" in new_key:
+                num_attention_heads = _config.num_attention_heads
+            if "k_proj" in new_key:
+                num_attention_heads = _config.num_key_value_heads
+            # convert the text model (basically mistral model)
+
+        if "q_proj" in new_key or "k_proj" in new_key:
+            value = permute_for_rope(value, num_attention_heads, _config)
+
+        new_dict[new_key] = value
+    return new_dict
+
+
+def convert_mistral_model(input_dir, output_dir):
+    text_config = MistralConfig(
+        attention_dropout=0.0,
+        bos_token_id=1,
+        eos_token_id=2,
+        head_dim=128,
+        hidden_act="silu",
+        hidden_size=5120,
+        initializer_range=0.02,
+        intermediate_size=14336,
+        max_position_embeddings=1024000,
+        model_type="mistral",
+        num_attention_heads=32,
+        num_hidden_layers=40,
+        num_key_value_heads=8,
+        rms_norm_eps=1e-05,
+        rope_theta=1000000000.0,
+        sliding_window=None,
+        tie_word_embeddings=False,
+        vocab_size=131072,
+    )
+
+    vision_config = PixtralVisionConfig()
+    config = LlavaConfig(
+        vision_config,
+        text_config,
+        vision_feature_layer=-1,
+        image_token_index=10,
+        vision_feature_select_strategy="full",
+        image_seq_length=1,
+    )
+    config.architectures = ["LlavaForConditionalGeneration"]
+    config.save_pretrained(output_dir)
+
+    original_state_dict = safe_load_file(f"{input_dir}/consolidated.safetensors")
+    new_dict = convert_dictionnary(original_state_dict, vision_config, text_config)
+
+    with torch.device("meta"):
+        model = LlavaForConditionalGeneration(config)
+    model.load_state_dict(new_dict, strict=True, assign=True)
+
+    model.save_pretrained(output_dir)
+
+    tokenizer = convert_mistral_tokenizer()
+    image_processor = PixtralImageProcessor()
+    processor = PixtralProcessor(tokenizer=tokenizer, image_processor=image_processor, image_token="[IMG]")
+    processor.save_pretrained(output_dir)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--input_dir",
+        help="Location of LLaMA weights, which contains tokenizer.model and model folders",
+    )
+    parser.add_argument(
+        "--output_dir",
+        help="Location to write HF model and tokenizer",
+    )
+
+    args = parser.parse_args()
+    convert_mistral_model(args.input_dir, args.output_dir)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/transformers/models/pixtral/image_processing_pixtral.py b/src/transformers/models/pixtral/image_processing_pixtral.py
new file mode 100644
index 000000000000..c6d18420bec5
--- /dev/null
+++ b/src/transformers/models/pixtral/image_processing_pixtral.py
@@ -0,0 +1,519 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for Pixtral."""
+
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import (
+    resize,
+    to_channel_dimension_format,
+)
+from ...image_utils import (
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    is_valid_image,
+    to_numpy_array,
+    valid_images,
+    validate_kwargs,
+    validate_preprocess_arguments,
+)
+from ...utils import TensorType, is_torch_device, is_torch_dtype, is_torch_tensor, is_vision_available, logging
+from ...utils.import_utils import requires_backends
+
+
+logger = logging.get_logger(__name__)
+
+
+if is_vision_available():
+    import PIL
+
+
+class BatchMixFeature(BatchFeature):
+    def to(self, *args, **kwargs) -> "BatchMixFeature":
+        """
+        Send all values to device by calling `v.to(*args, **kwargs)` (PyTorch only). This should support casting in
+        different `dtypes` and sending the `BatchFeature` to a different `device`.
+
+        Args:
+            args (`Tuple`):
+                Will be passed to the `to(...)` function of the tensors.
+            kwargs (`Dict`, *optional*):
+                Will be passed to the `to(...)` function of the tensors.
+
+        Returns:
+            [`BatchFeature`]: The same instance after modification.
+        """
+        requires_backends(self, ["torch"])
+        import torch  # noqa
+
+        new_data = {}
+        device = kwargs.get("device")
+        # Check if the args are a device or a dtype
+        if device is None and len(args) > 0:
+            # device should be always the first argument
+            arg = args[0]
+            if is_torch_dtype(arg):
+                # The first argument is a dtype
+                pass
+            elif isinstance(arg, str) or is_torch_device(arg) or isinstance(arg, int):
+                device = arg
+            else:
+                # it's something else
+                raise ValueError(f"Attempting to cast a BatchFeature to type {str(arg)}. This is not supported.")
+        # We cast only floating point tensors to avoid issues with tokenizers casting `LongTensor` to `FloatTensor`
+        for k, v in self.items():
+            # check if v is a floating point
+            if isinstance(v, list):
+                new_data[k] = [
+                    element.to(*args, **kwargs) for sample in v for element in sample if is_torch_tensor(element)
+                ]
+            elif torch.is_floating_point(v):
+                # cast and send to device
+                new_data[k] = v.to(*args, **kwargs)
+            elif device is not None:
+                new_data[k] = v.to(device=device)
+            else:
+                new_data[k] = v
+        self.data = new_data
+        return self
+
+
+# Copied from transformers.models.idefics2.image_processing_idefics2.make_list_of_images
+def make_list_of_images(images: ImageInput) -> List[List[np.ndarray]]:
+    """
+    Convert a single image or a list of images to a list of numpy arrays.
+
+    Args:
+        images (`ImageInput`):
+            A single image or a list of images.
+
+    Returns:
+        A list of numpy arrays.
+    """
+    # If it's a single image, convert it to a list of lists
+    if is_valid_image(images):
+        images = [[images]]
+    # If it's a list of images, it's a single batch, so convert it to a list of lists
+    elif isinstance(images, (list, tuple)) and len(images) > 0 and is_valid_image(images[0]):
+        images = [images]
+    # If it's a list of batches, it's already in the right format
+    elif (
+        isinstance(images, (list, tuple))
+        and len(images) > 0
+        and isinstance(images[0], (list, tuple))
+        and is_valid_image(images[0][0])
+    ):
+        pass
+    else:
+        raise ValueError(
+            "Invalid input type. Must be a single image, a list of images, or a list of batches of images."
+        )
+    return images
+
+
+# Adapted from function in image_transforms.py to ensure any transparent pixels are converted to white.
+def convert_to_rgb(image: ImageInput) -> ImageInput:
+    """
+    Converts an image to RGB format. Only converts if the image is of type PIL.Image.Image, otherwise returns the image
+    as is.
+    Args:
+        image (Image):
+            The image to convert.
+    """
+    requires_backends(convert_to_rgb, ["vision"])
+
+    if not isinstance(image, PIL.Image.Image):
+        return image
+
+    if image.mode == "RGB":
+        return image
+
+    # First we convert to RGBA to set background to white.
+    image = image.convert("RGBA")
+
+    # Create a new image with a white background.
+    new_image = PIL.Image.new("RGBA", image.size, "WHITE")
+    new_image.paste(image, (0, 0), image)
+    new_image = new_image.convert("RGB")
+    return new_image
+
+
+def _num_image_tokens(image_size: Tuple[int, int], patch_size: Tuple[int, int]) -> int:
+    """
+    Calculate the number of image tokens given the image size and patch size.
+
+    Args:
+        image_size (`Tuple[int, int]`):
+            The size of the image as `(height, width)`.
+        patch_size (`Tuple[int, int]`):
+            The patch size as `(height, width)`.
+
+    Returns:
+        `int`: The number of image tokens.
+    """
+    height, width = image_size
+    patch_height, patch_width = patch_size if isinstance(patch_size, (tuple, list)) else (patch_size, patch_size)
+    num_width_tokens = (width - 1) // patch_width + 1
+    num_height_tokens = (height - 1) // patch_height + 1
+    return num_height_tokens, num_width_tokens
+
+
+def get_resize_output_image_size(
+    input_image: np.ndarray,
+    size: Union[int, Tuple[int, int], List[int], Tuple[int]],
+    patch_size: Union[int, Tuple[int, int], List[int], Tuple[int]],
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> tuple:
+    """
+    Find the target (height, width) dimension of the output image after resizing given the input image and the desired
+    size.
+
+    Args:
+        input_image (`np.ndarray`):
+            The image to resize.
+        size (`int` or `Tuple[int, int]`):
+            Max image size an input image can be. Must be a dictionary with the key "longest_edge".
+        patch_size (`int` or `Tuple[int, int]`):
+            The patch_size as `(height, width)` to use for resizing the image. If patch_size is an integer, `(patch_size, patch_size)`
+            will be used
+        input_data_format (`ChannelDimension`, *optional*):
+            The channel dimension format of the input image. If unset, will use the inferred format from the input.
+
+    Returns:
+        `tuple`: The target (height, width) dimension of the output image after resizing.
+    """
+    max_height, max_width = size if isinstance(size, (tuple, list)) else (size, size)
+    patch_height, patch_width = patch_size if isinstance(patch_size, (tuple, list)) else (patch_size, patch_size)
+    height, width = get_image_size(input_image, input_data_format)
+
+    ratio = max(height / max_height, width / max_width)
+
+    if ratio > 1:
+        # Orgiginal implementation uses `round` which utilises bankers rounding, which can lead to surprising results
+        height = int(np.ceil(height / ratio))
+        width = int(np.ceil(width / ratio))
+
+    num_height_tokens, num_width_tokens = _num_image_tokens((height, width), (patch_height, patch_width))
+    return num_height_tokens * patch_height, num_width_tokens * patch_width
+
+
+# Hack to get tensor conversion used in BatchFeature without batching the images
+def _get_is_as_tensor_fns(tensor_type: Union[str, TensorType]) -> Tuple[Callable, Callable]:
+    return BatchFeature()._get_is_as_tensor_fns(tensor_type)
+
+
+def convert_to_tensor(array, tensor_type: Union[str, TensorType]) -> Any:
+    is_tensor, as_tensor = _get_is_as_tensor_fns(tensor_type)
+    if is_tensor(array):
+        return array
+    return as_tensor(array)
+
+
+class PixtralImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a Pixtral image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
+            `do_resize` in the `preprocess` method.
+        size (`Dict[str, int]` *optional*, defaults to `{"longest_edge": 1024}`):
+            Size of the maximum dimension of either the height or width dimension of the image. Used to control how
+            images are resized. If either the height or width are greater than `size["longest_edge"]` then both the height and width are rescaled by `height / ratio`, `width /ratio` where `ratio = max(height / longest_edge, width / longest_edge)`
+        patch_size (`Dict[str, int]` *optional*, defaults to `{"height": 16, "width": 16}`):
+            Size of the patches in the model, used to calculate the output image size. Can be overridden by `patch_size` in the `preprocess` method.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
+            Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
+            the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
+            method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method.
+        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+            Can be overridden by the `image_std` parameter in the `preprocess` method.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Dict[str, int] = None,
+        patch_size: Dict[str, int] = None,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: bool = True,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"longest_edge": 1024}
+        patch_size = patch_size if patch_size is not None else {"height": 16, "width": 16}
+        patch_size = get_size_dict(patch_size, default_to_square=True)
+
+        self.do_resize = do_resize
+        self.size = size
+        self.patch_size = patch_size
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else [0.48145466, 0.4578275, 0.40821073]
+        self.image_std = image_std if image_std is not None else [0.26862954, 0.26130258, 0.27577711]
+        self.do_convert_rgb = do_convert_rgb
+        self._valid_processor_keys = [
+            "images",
+            "do_resize",
+            "size",
+            "patch_size",
+            "resample",
+            "do_rescale",
+            "rescale_factor",
+            "do_normalize",
+            "image_mean",
+            "image_std",
+            "do_convert_rgb",
+            "return_tensors",
+            "data_format",
+            "input_data_format",
+        ]
+
+    def resize(
+        self,
+        image: np.ndarray,
+        size: Dict[str, int],
+        patch_size: Dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge
+        resized to keep the input aspect ratio.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`Dict[str, int]`):
+                Dict containing the longest possible edge of the image.
+            patch_size (`Dict[str, int]`):
+                Patch size used to calculate the size of the output image.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+                Resampling filter to use when resiizing the image.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+        """
+        if "longest_edge" in size:
+            size = (size["longest_edge"], size["longest_edge"])
+        elif "height" in size and "width" in size:
+            size = (size["height"], size["width"])
+        else:
+            raise ValueError("size must contain either 'longest_edge' or 'height' and 'width'.")
+
+        if "height" in patch_size and "width" in patch_size:
+            patch_size = (patch_size["height"], patch_size["width"])
+        else:
+            raise ValueError("patch_size must contain either 'shortest_edge' or 'height' and 'width'.")
+
+        output_size = get_resize_output_image_size(
+            image,
+            size=size,
+            patch_size=patch_size,
+            input_data_format=input_data_format,
+        )
+        return resize(
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: bool = None,
+        size: Dict[str, int] = None,
+        patch_size: Dict[str, int] = None,
+        resample: PILImageResampling = None,
+        do_rescale: bool = None,
+        rescale_factor: float = None,
+        do_normalize: bool = None,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: bool = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> PIL.Image.Image:
+        """
+        Preprocess an image or batch of images.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
+                Describes the maximum input dimensions to the model.
+            patch_size (`Dict[str, int]`, *optional*, defaults to `self.patch_size`):
+                Patch size in the model. Used to calculate the image after resizing.
+            resample (`int`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
+                has an effect if `do_resize` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
+                `True`.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        patch_size = patch_size if patch_size is not None else self.patch_size
+        patch_size = get_size_dict(patch_size, default_to_square=True)
+
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        size = size if size is not None else self.size
+        resample = resample if resample is not None else self.resample
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+
+        validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys)
+
+        images_list = make_list_of_images(images)
+
+        if not valid_images(images_list[0]):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+
+        if do_convert_rgb:
+            images_list = [[convert_to_rgb(image) for image in images] for images in images_list]
+
+        # All transformations expect numpy arrays.
+        images_list = [[to_numpy_array(image) for image in images] for images in images_list]
+
+        if is_scaled_image(images_list[0][0]) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images_list[0][0])
+
+        batch_images = []
+        batch_image_sizes = []
+        for sample_images in images_list:
+            images = []
+            image_sizes = []
+            for image in sample_images:
+                if do_resize:
+                    image = self.resize(
+                        image=image,
+                        size=size,
+                        patch_size=patch_size,
+                        resample=resample,
+                        input_data_format=input_data_format,
+                    )
+
+                if do_rescale:
+                    image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+
+                if do_normalize:
+                    image = self.normalize(
+                        image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
+                    )
+
+                images.append(image)
+                image_sizes.append(get_image_size(image, input_data_format))
+            batch_images.append(images)
+            batch_image_sizes.append(image_sizes)
+
+        images_list = [
+            [to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images]
+            for images in batch_images
+        ]
+
+        # Convert to tensor type outside of BatchFeature to avoid batching the images of different sizes
+        images_list = [[convert_to_tensor(image, return_tensors) for image in images] for images in images_list]
+        return BatchMixFeature(data={"pixel_values": images_list, "image_sizes": batch_image_sizes}, tensor_type=None)
diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py
new file mode 100644
index 000000000000..0e10c78b7852
--- /dev/null
+++ b/src/transformers/models/pixtral/modeling_pixtral.py
@@ -0,0 +1,517 @@
+# coding=utf-8
+# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Pixtral model."""
+
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ... import PreTrainedModel
+from ...activations import ACT2FN
+from ...modeling_outputs import BaseModelOutput
+from ...utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+)
+from .configuration_pixtral import PixtralVisionConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+def position_ids_in_meshgrid(patch_embeds_list, max_width):
+    positions = []
+    for patch in patch_embeds_list:
+        height, width = patch.shape[-2:]
+        mesh = torch.meshgrid(torch.arange(height), torch.arange(width), indexing="ij")
+        h_grid, v_grid = torch.stack(mesh, dim=-1).reshape(-1, 2).chunk(2, -1)
+        ids = h_grid * max_width + v_grid
+        positions.append(ids[:, 0])
+    return torch.cat(positions)
+
+
+class PixtralRotaryEmbedding(nn.Module):
+    """
+    The key with pixtral embedding is just that you have a frequency for each pixel positions.
+    If you have height x width pixels (or embedding pixels)
+
+    then the frequency used for ROPE is given by indexing the pre_computed frequency on the
+    width and height.
+
+    What you output is of dimension batch, height * width, dim with dim the embed dim.
+
+    This simply means that for each image hidden states, you are going to add
+    a corresponding positional embedding, based on it's index in the grid.
+    """
+
+    def __init__(self, config, device):
+        super().__init__()
+        self.rope_type = "default"
+        self.dim = config.head_dim
+        self.base = config.rope_theta
+        max_patches_per_side = config.image_size // config.patch_size
+        freqs = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float() / self.dim))
+
+        h = torch.arange(max_patches_per_side, device=freqs.device)
+        w = torch.arange(max_patches_per_side, device=freqs.device)
+
+        freqs_h = torch.outer(h, freqs[::2]).float()
+        freqs_w = torch.outer(w, freqs[1::2]).float()
+        inv_freq = torch.cat(
+            [
+                freqs_h[:, None, :].repeat(1, max_patches_per_side, 1),
+                freqs_w[None, :, :].repeat(max_patches_per_side, 1, 1),
+            ],
+            dim=-1,
+        ).reshape(-1, self.dim // 2)  # we reshape to only index on the position indexes, not tuple of indexes
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+
+        # TODO maybe make it torch compatible later on. We can also just slice
+        self.register_buffer("inv_freq", torch.cat((inv_freq, inv_freq), dim=-1), persistent=False)
+
+    @torch.no_grad()
+    def forward(self, x, position_ids):
+        if "dynamic" in self.rope_type:
+            self._dynamic_frequency_update(position_ids, device=x.device)
+
+        # Core RoPE block
+        freqs = self.inv_freq[position_ids]
+        # position_ids_expanded = position_ids[:, None, :].float()
+        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
+        device_type = x.device.type
+        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):
+            emb = freqs
+            cos = emb.cos()
+            sin = emb.sin()
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+    def _dynamic_frequency_update(self, position_ids, device):
+        """
+        dynamic RoPE layers should recompute `inv_freq` in the following situations:
+        1 - growing beyond the cached sequence length (allow scaling)
+        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
+        """
+        seq_len = torch.max(position_ids) + 1
+        if seq_len > self.max_seq_len_cached:  # growth
+            inv_freq, self.attention_scaling = self.rope_init_fn(
+                self.config, device, seq_len=seq_len, **self.rope_kwargs
+            )
+            self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: may break with compilation
+            self.max_seq_len_cached = seq_len
+
+        if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:  # reset
+            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+            self.max_seq_len_cached = self.original_max_seq_len
+
+
+# Copied from transformers.models.llama.modeling_llama.rotate_half
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+class PixtralAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+
+        self.scale = self.head_dim**-0.5
+        self.dropout = config.attention_dropout
+
+        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
+        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
+        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
+        self.o_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Input shape: Batch x Time x Channel"""
+
+        batch_size, patches, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, unsqueeze_dim=0)
+
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale
+
+        if attention_mask is not None:
+            attn_weights = attn_weights + attention_mask
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(batch_size, patches, -1)
+
+        attn_output = self.o_proj(attn_output)
+
+        return attn_output, attn_weights
+
+
+# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Pixtral
+class PixtralMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_state):
+        return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Pixtral
+class PixtralRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        PixtralRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class PixtralAttentionLayer(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.attention_norm = PixtralRMSNorm(config.hidden_size, eps=1e-5)
+        self.feed_forward = PixtralMLP(config)
+        self.attention = PixtralAttention(config)
+        self.ffn_norm = PixtralRMSNorm(config.hidden_size, eps=1e-5)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        position_embeddings: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> Tuple[torch.FloatTensor]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`):
+                Input to the layer of shape `(batch, seq_len, embed_dim)`.
+            attention_mask (`torch.FloatTensor`):
+                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
+            output_attentions (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+
+        hidden_states = self.attention_norm(hidden_states)
+        hidden_states, attn_weights = self.attention(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_embeddings=position_embeddings,
+            output_attentions=output_attentions,
+        )
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.ffn_norm(hidden_states)
+        hidden_states = self.feed_forward(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+        return outputs
+
+
+class PixtralTransformer(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.layers = torch.nn.ModuleList()
+        for _ in range(config.num_hidden_layers):
+            self.layers.append(PixtralAttentionLayer(config))
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        inputs_embeds,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutput]:
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix.
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        hidden_states = inputs_embeds
+        for encoder_layer in self.layers:
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    encoder_layer.__call__,
+                    hidden_states,
+                    attention_mask,
+                    position_embeddings,
+                    output_attentions,
+                )
+            else:
+                layer_outputs = encoder_layer(
+                    hidden_states,
+                    attention_mask,
+                    position_embeddings=position_embeddings,
+                    output_attentions=output_attentions,
+                )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=[hidden_states], attentions=all_attentions
+        )
+
+
+PIXTRAL_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`PixtralVisionConfig`] or [`PixtralVisionConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+@add_start_docstrings(
+    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
+    PIXTRAL_START_DOCSTRING,
+)
+class PixtralPreTrainedModel(PreTrainedModel):
+    config_class = PixtralVisionConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["PixtralVisionAttention"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_cache_class = True
+
+    def _init_weights(self, module):
+        # important: this ported version of Pixtral isn't meant for training from scratch - only
+        # inference and fine-tuning - so the proper init weights code has been removed - the original codebase
+        # https://github.com/haotian-liu/LLaVA/tree/main/pixtral should serve for that purpose
+        std = (
+            self.config.initializer_range
+            if hasattr(self.config, "initializer_range")
+            else self.config.text_config.initializer_range
+        )
+
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+
+PIXTRAL_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values: list of N_img images of variable sizes,
+                each of shape (C, H, W)
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+def generate_block_attention_mask(patch_embeds_list, tensor):
+    dtype = tensor.dtype
+    device = tensor.device
+    seq_len = tensor.shape[1]
+    d_min = torch.finfo(dtype).min
+    causal_mask = torch.full((seq_len, seq_len), fill_value=d_min, dtype=dtype, device=device)
+
+    block_end_idx = torch.tensor(patch_embeds_list).cumsum(-1)
+    block_start_idx = torch.tensor([0] + patch_embeds_list[:-1]).cumsum(-1)
+    for start, end in zip(block_start_idx, block_end_idx):
+        causal_mask[start:end, start:end] = 0
+
+    causal_mask = causal_mask[None, None, :, :].expand(tensor.shape[0], 1, -1, -1)
+    return causal_mask
+
+
+@add_start_docstrings(
+    """The PIXTRAL model which consists of a vision backbone and a language model.""",
+    PIXTRAL_START_DOCSTRING,
+)
+class PixtralModel(PixtralPreTrainedModel):
+    base_model_prefix = "vision_encoder"
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        self.patch_conv = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=config.hidden_size,
+            kernel_size=config.patch_size,
+            stride=config.patch_size,
+            bias=False,
+        )
+        self.ln_pre = PixtralRMSNorm(config.hidden_size, eps=1e-5)
+        self.transformer = PixtralTransformer(config)
+        self.patch_positional_embedding = PixtralRotaryEmbedding(config, device=self.device)
+
+    @add_start_docstrings_to_model_forward(PIXTRAL_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        pixel_values: List[torch.Tensor],
+        output_hidden_states: Optional[bool] = False,
+        output_attentions: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        *args,
+        **kwargs,
+    ) -> Union[Tuple, BaseModelOutput]:
+        """
+        Returns:
+            pixel_values: tensor of token features for
+                all tokens of all images of shape (N_toks, D)
+        """
+        # pass images through initial convolution independently
+        patch_embeds_list = [self.patch_conv(img.unsqueeze(0).to(self.dtype)) for img in pixel_values]
+
+        # flatten to a single sequence
+        patch_embeds = torch.cat([p.flatten(2).permute(0, 2, 1) for p in patch_embeds_list], dim=1)
+        patch_embeds = self.ln_pre(patch_embeds)
+
+        # positional embeddings
+        position_ids = position_ids_in_meshgrid(
+            patch_embeds_list, max_width=self.config.image_size // self.config.patch_size
+        ).to(self.device)
+
+        position_embedding = self.patch_positional_embedding(patch_embeds, position_ids)
+        attention_mask = generate_block_attention_mask(
+            [p.shape[-2] * p.shape[-1] for p in patch_embeds_list], patch_embeds
+        )
+        return self.transformer(patch_embeds, attention_mask, position_embedding)
diff --git a/src/transformers/models/pixtral/processing_pixtral.py b/src/transformers/models/pixtral/processing_pixtral.py
new file mode 100644
index 000000000000..9362703c8aa6
--- /dev/null
+++ b/src/transformers/models/pixtral/processing_pixtral.py
@@ -0,0 +1,282 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for Pixtral.
+"""
+
+from typing import List, Optional, Union
+
+from ...feature_extraction_utils import BatchFeature
+from ...image_utils import ImageInput, is_valid_image, load_image
+from ...processing_utils import ProcessorMixin
+from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
+from ...utils import TensorType, is_torch_device, is_torch_dtype, is_torch_tensor, logging, requires_backends
+
+
+logger = logging.get_logger(__name__)
+
+
+# Copied from transformers.models.idefics2.processing_idefics2.is_url
+def is_url(val) -> bool:
+    return isinstance(val, str) and val.startswith("http")
+
+
+# Copied from transformers.models.idefics2.processing_idefics2.is_image_or_image_url
+def is_image_or_image_url(elem):
+    return is_url(elem) or is_valid_image(elem)
+
+
+# Copied from transformers.models.pixtral.image_processing_pixtral.BatchMixFeature
+class BatchMixFeature(BatchFeature):
+    def to(self, *args, **kwargs) -> "BatchMixFeature":
+        """
+        Send all values to device by calling `v.to(*args, **kwargs)` (PyTorch only). This should support casting in
+        different `dtypes` and sending the `BatchFeature` to a different `device`.
+
+        Args:
+            args (`Tuple`):
+                Will be passed to the `to(...)` function of the tensors.
+            kwargs (`Dict`, *optional*):
+                Will be passed to the `to(...)` function of the tensors.
+
+        Returns:
+            [`BatchFeature`]: The same instance after modification.
+        """
+        requires_backends(self, ["torch"])
+        import torch  # noqa
+
+        new_data = {}
+        device = kwargs.get("device")
+        # Check if the args are a device or a dtype
+        if device is None and len(args) > 0:
+            # device should be always the first argument
+            arg = args[0]
+            if is_torch_dtype(arg):
+                # The first argument is a dtype
+                pass
+            elif isinstance(arg, str) or is_torch_device(arg) or isinstance(arg, int):
+                device = arg
+            else:
+                # it's something else
+                raise ValueError(f"Attempting to cast a BatchFeature to type {str(arg)}. This is not supported.")
+        # We cast only floating point tensors to avoid issues with tokenizers casting `LongTensor` to `FloatTensor`
+        for k, v in self.items():
+            # check if v is a floating point
+            if isinstance(v, list):
+                new_data[k] = [
+                    element.to(*args, **kwargs) for sample in v for element in sample if is_torch_tensor(element)
+                ]
+            elif torch.is_floating_point(v):
+                # cast and send to device
+                new_data[k] = v.to(*args, **kwargs)
+            elif device is not None:
+                new_data[k] = v.to(device=device)
+            else:
+                new_data[k] = v
+        self.data = new_data
+        return self
+
+
+class PixtralProcessor(ProcessorMixin):
+    r"""
+    Constructs a Pixtral processor which wraps a Pixtral image processor and a Pixtral tokenizer into a single processor.
+
+    [`PixtralProcessor`] offers all the functionalities of [`CLIPImageProcessor`] and [`LlamaTokenizerFast`]. See the
+    [`~PixtralProcessor.__call__`] and [`~PixtralProcessor.decode`] for more information.
+
+    Args:
+        image_processor ([`PixtralImageProcessor`], *optional*):
+            The image processor is a required input.
+        tokenizer ([`LlamaTokenizerFast`], *optional*):
+            The tokenizer is a required input.
+        patch_size (`int`, *optional*, defaults to 16):
+            Patch size from the vision tower.
+        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
+            in a chat into a tokenizable string.
+        image_token (`str`, *optional*, defaults to `"[IMG]"`):
+            Special token used to denote image location.
+        image_break_token (`str`, *optional*, defaults to `"[IMG_BREAK]"`):
+            Special token used to denote the end of a line of pixels in an image.
+        image_end_token (`str`, *optional*, defaults to `"[IMG_END]"`):
+            Special token used to denote the end of an image input.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    valid_kwargs = [
+        "chat_template",
+        "patch_size",
+        "image_token",
+        "image_break_token",
+        "image_end_token",
+    ]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(
+        self,
+        image_processor=None,
+        tokenizer=None,
+        patch_size: int = 16,
+        chat_template=None,
+        image_token="[IMG]",  # set the default and let users change if they have peculiar special tokens in rare cases
+        image_break_token="[IMG_BREAK]",
+        image_end_token="[IMG_END]",
+        **kwargs,
+    ):
+        self.patch_size = patch_size
+        self.image_token = image_token
+        self.image_break_token = image_break_token
+        self.image_end_token = image_end_token
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+
+    def __call__(
+        self,
+        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+        images: ImageInput = None,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length=None,
+        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
+    ) -> BatchMixFeature:
+        """
+        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
+        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
+        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
+        of the above two methods for more information.
+
+        Args:
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
+                Select a strategy to pad the returned sequences (according to the model's padding side and padding
+                index) among:
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            max_length (`int`, *optional*):
+                Maximum length of the returned list and optionally padding length (see above).
+            truncation (`bool`, *optional*):
+                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+            `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+        """
+        if images is not None:
+            if is_image_or_image_url(images):
+                images = [[images]]
+            elif isinstance(images, list) and is_image_or_image_url(images[0]):
+                images = [images]
+            elif (
+                not isinstance(images, list)
+                and not isinstance(images[0], list)
+                and not is_image_or_image_url(images[0][0])
+            ):
+                raise ValueError(
+                    "Invalid input images. Please provide a single image or a list of images or a list of list of images."
+                )
+            images = [[load_image(im) for im in sample] for sample in images]
+            image_inputs = self.image_processor(images, patch_size=self.patch_size, return_tensors=return_tensors)
+        else:
+            image_inputs = {}
+
+        if isinstance(text, str):
+            text = [text]
+        elif not isinstance(text, list) and not isinstance(text[0], str):
+            raise ValueError("Invalid input text. Please provide a string, or a list of strings")
+
+        # try to expand inputs in processing if we have the necessary parts
+        prompt_strings = text
+        if image_inputs.get("pixel_values") is not None:
+            # Replace the image token with the expanded image token sequence
+            images = image_inputs["pixel_values"]
+            image_sizes = image_inputs.pop("image_sizes")
+            prompt_strings = []
+
+            for sample_images, sample_image_sizes, sample in zip(images, image_sizes, text):
+                replace_strings = []
+                # First calculate the number of tokens needed for each image and put in a placeholder
+                for image, image_size in zip(sample_images, sample_image_sizes):
+                    height, width = image_size
+                    num_height_tokens = height // self.patch_size
+                    num_width_tokens = width // self.patch_size
+                    replace_tokens = [
+                        [self.image_token] * num_width_tokens + [self.image_break_token]
+                    ] * num_height_tokens
+                    # Flatten list
+                    replace_tokens = [item for sublist in replace_tokens for item in sublist]
+                    replace_tokens[-1] = self.image_end_token
+                    replace_str = "".join(replace_tokens)
+                    replace_strings.append(replace_str)
+                    sample = sample.replace(self.image_token, "<placeholder>", 1)
+
+                while "<placeholder>" in sample:
+                    replace_str = replace_strings.pop(0)
+                    sample = sample.replace("<placeholder>", replace_str, 1)
+
+                prompt_strings.append(sample)
+
+        text_inputs = self.tokenizer(
+            prompt_strings,
+            return_tensors=return_tensors,
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+        )
+        return BatchMixFeature(data={**text_inputs, **image_inputs})
+
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @property
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index b9ce0d0f15bb..2db7b38b5803 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -7067,6 +7067,20 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
+class PixtralModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class PixtralPreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 class PLBartForCausalLM(metaclass=DummyObject):
     _backends = ["torch"]
 
diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py
index 2493954a518b..436378582e54 100644
--- a/src/transformers/utils/dummy_vision_objects.py
+++ b/src/transformers/utils/dummy_vision_objects.py
@@ -506,6 +506,13 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["vision"])
 
 
+class PixtralImageProcessor(metaclass=DummyObject):
+    _backends = ["vision"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["vision"])
+
+
 class PoolFormerFeatureExtractor(metaclass=DummyObject):
     _backends = ["vision"]
 
diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py
index 2fed802b5a2f..5c05480ffa6d 100644
--- a/tests/models/llava/test_modeling_llava.py
+++ b/tests/models/llava/test_modeling_llava.py
@@ -569,3 +569,50 @@ def test_expansion_in_processing(self):
 
         # check that both inputs are handled correctly and generate the same output
         self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist())
+
+    @slow
+    @require_bitsandbytes
+    def test_pixtral(self):
+        model_id = "hf-internal-testing/pixtral-12b"
+        model = LlavaForConditionalGeneration.from_pretrained(model_id)
+        processor = AutoProcessor.from_pretrained(model_id)
+
+        IMG_URLS = [
+            Image.open(requests.get("https://picsum.photos/id/237/400/300", stream=True).raw),
+            Image.open(requests.get("https://picsum.photos/id/231/200/300", stream=True).raw),
+            Image.open(requests.get("https://picsum.photos/id/27/500/500", stream=True).raw),
+            Image.open(requests.get("https://picsum.photos/id/17/150/600", stream=True).raw),
+        ]
+        PROMPT = "<s>[INST]Describe the images.\n[IMG][IMG][IMG][IMG][/INST]"
+
+        # image = Image.open(requests.get(url, stream=True).raw)
+        inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to("cuda")
+        generate_ids = model.generate(**inputs, max_new_tokens=500)
+        ouptut = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+
+        # fmt: off
+        EXPECTED_GENERATION = """
+Describe the images.
+Sure, let's break down each image description:
+
+1. **Image 1:**
+   - **Description:** A black dog with a glossy coat is sitting on a wooden floor. The dog has a focused expression and is looking directly at the camera.
+   - **Details:** The wooden floor has a rustic appearance with visible wood grain patterns. The dog's eyes are a striking color, possibly brown or amber, which contrasts with its black fur.
+
+2. **Image 2:**
+   - **Description:** A scenic view of a mountainous landscape with a winding road cutting through it. The road is surrounded by lush green vegetation and leads to a distant valley.
+   - **Details:** The mountains are rugged with steep slopes, and the sky is clear, indicating good weather. The winding road adds a sense of depth and perspective to the image.
+
+3. **Image 3:**
+   - **Description:** A beach scene with waves crashing against the shore. There are several people in the water and on the beach, enjoying the waves and the sunset.
+   - **Details:** The waves are powerful, creating a dynamic and lively atmosphere. The sky is painted with hues of orange and pink from the setting sun, adding a warm glow to the scene.
+
+4. **Image 4:**
+   - **Description:** A garden path leading to a large tree with a bench underneath it. The path is bordered by well-maintained grass and flowers.
+   - **Details:** The path is made of small stones or gravel, and the tree provides a shaded area with the bench invitingly placed beneath it. The surrounding area is lush and green, suggesting a well-kept garden.
+
+Each image captures a different scene, from a close-up of a dog to expansive natural landscapes, showcasing various elements of nature and human interaction with it.
+"""
+        # fmt: on
+        # check that both inputs are handled correctly and generate the same output
+        self.assertListEqual(ouptut, EXPECTED_GENERATION)
diff --git a/tests/models/pixtral/__init__.py b/tests/models/pixtral/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/models/pixtral/test_image_processing_pixtral.py b/tests/models/pixtral/test_image_processing_pixtral.py
new file mode 100644
index 000000000000..3994201c065c
--- /dev/null
+++ b/tests/models/pixtral/test_image_processing_pixtral.py
@@ -0,0 +1,217 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+import unittest
+
+import numpy as np
+
+from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_torch_available, is_vision_available
+
+from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
+
+
+if is_torch_available():
+    import torch
+
+if is_vision_available():
+    from PIL import Image
+
+    from transformers import PixtralImageProcessor
+
+
+class PixtralImageProcessingTester(unittest.TestCase):
+    def __init__(
+        self,
+        parent,
+        batch_size=7,
+        num_channels=3,
+        image_size=18,
+        max_num_images_per_sample=3,
+        min_resolution=30,
+        max_resolution=400,
+        do_resize=True,
+        size=None,
+        patch_size=None,
+        do_normalize=True,
+        image_mean=[0.48145466, 0.4578275, 0.40821073],
+        image_std=[0.26862954, 0.26130258, 0.27577711],
+        do_convert_rgb=True,
+    ):
+        size = size if size is not None else {"longest_edge": 24}
+        patch_size = patch_size if patch_size is not None else {"height": 8, "width": 8}
+        self.parent = parent
+        self.batch_size = batch_size
+        self.num_channels = num_channels
+        self.image_size = image_size
+        self.max_num_images_per_sample = max_num_images_per_sample
+        self.min_resolution = min_resolution
+        self.max_resolution = max_resolution
+        self.do_resize = do_resize
+        self.size = size
+        self.patch_size = patch_size
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.do_convert_rgb = do_convert_rgb
+
+    def prepare_image_processor_dict(self):
+        return {
+            "do_resize": self.do_resize,
+            "size": self.size,
+            "patch_size": self.patch_size,
+            "do_normalize": self.do_normalize,
+            "image_mean": self.image_mean,
+            "image_std": self.image_std,
+            "do_convert_rgb": self.do_convert_rgb,
+        }
+
+    def expected_output_image_shape(self, image):
+        if isinstance(image, Image.Image):
+            width, height = image.size
+        elif isinstance(image, np.ndarray):
+            height, width = image.shape[:2]
+        elif isinstance(image, torch.Tensor):
+            height, width = image.shape[-2:]
+
+        max_height = max_width = self.size.get("longest_edge")
+
+        ratio = max(height / max_height, width / max_width)
+        if ratio > 1:
+            height = int(np.ceil(height / ratio))
+            width = int(np.ceil(width / ratio))
+
+        patch_height, patch_width = self.patch_size["height"], self.patch_size["width"]
+        num_height_tokens = (height - 1) // patch_height + 1
+        num_width_tokens = (width - 1) // patch_width + 1
+
+        height = num_height_tokens * patch_height
+        width = num_width_tokens * patch_width
+
+        return self.num_channels, height, width
+
+    def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False):
+        # Use prepare_image_inputs to make a list of list of single images
+
+        images_list = []
+        for _ in range(self.batch_size):
+            images = []
+            for _ in range(random.randint(1, self.max_num_images_per_sample)):
+                img = prepare_image_inputs(
+                    batch_size=1,
+                    num_channels=self.num_channels,
+                    min_resolution=self.min_resolution,
+                    max_resolution=self.max_resolution,
+                    equal_resolution=equal_resolution,
+                    numpify=numpify,
+                    torchify=torchify,
+                )[0]
+                images.append(img)
+            images_list.append(images)
+        return images_list
+
+
+@require_torch
+@require_vision
+class PixtralImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
+    image_processing_class = PixtralImageProcessor if is_vision_available() else None
+
+    def setUp(self):
+        super().setUp()
+        self.image_processor_tester = PixtralImageProcessingTester(self)
+
+    @property
+    def image_processor_dict(self):
+        return self.image_processor_tester.prepare_image_processor_dict()
+
+    def test_image_processor_properties(self):
+        image_processing = self.image_processing_class(**self.image_processor_dict)
+        self.assertTrue(hasattr(image_processing, "do_resize"))
+        self.assertTrue(hasattr(image_processing, "size"))
+        self.assertTrue(hasattr(image_processing, "patch_size"))
+        self.assertTrue(hasattr(image_processing, "do_rescale"))
+        self.assertTrue(hasattr(image_processing, "rescale_factor"))
+        self.assertTrue(hasattr(image_processing, "do_normalize"))
+        self.assertTrue(hasattr(image_processing, "image_mean"))
+        self.assertTrue(hasattr(image_processing, "image_std"))
+        self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
+
+    def test_call_pil(self):
+        # Initialize image_processing
+        image_processing = self.image_processing_class(**self.image_processor_dict)
+        # create random PIL images
+        image_inputs_list = self.image_processor_tester.prepare_image_inputs()
+        for image_inputs in image_inputs_list:
+            for image in image_inputs:
+                self.assertIsInstance(image, Image.Image)
+
+        # Test not batched input
+        encoded_images = image_processing(image_inputs_list[0][0], return_tensors="pt").pixel_values
+        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs_list[0][0])
+        self.assertEqual(tuple(encoded_images[0][0].shape), expected_output_image_shape)
+
+        # Test batched
+        batch_encoded_images = image_processing(image_inputs_list, return_tensors="pt").pixel_values
+        for encoded_images, images in zip(batch_encoded_images, image_inputs_list):
+            for encoded_image, image in zip(encoded_images, images):
+                expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image)
+                self.assertEqual(tuple(encoded_image.shape), expected_output_image_shape)
+
+    def test_call_numpy(self):
+        # Initialize image_processing
+        image_processing = self.image_processing_class(**self.image_processor_dict)
+        # create random numpy tensors
+        image_inputs_list = self.image_processor_tester.prepare_image_inputs(numpify=True)
+        for image_inputs in image_inputs_list:
+            for image in image_inputs:
+                self.assertIsInstance(image, np.ndarray)
+
+        # Test not batched input
+        encoded_images = image_processing(image_inputs_list[0][0], return_tensors="pt").pixel_values
+        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs_list[0][0])
+        self.assertEqual(tuple(encoded_images[0][0].shape), expected_output_image_shape)
+
+        # Test batched
+        batch_encoded_images = image_processing(image_inputs_list, return_tensors="pt").pixel_values
+        for encoded_images, images in zip(batch_encoded_images, image_inputs_list):
+            for encoded_image, image in zip(encoded_images, images):
+                expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image)
+                self.assertEqual(tuple(encoded_image.shape), expected_output_image_shape)
+
+    def test_call_pytorch(self):
+        # Initialize image_processing
+        image_processing = self.image_processing_class(**self.image_processor_dict)
+        # create random PyTorch tensors
+        image_inputs_list = self.image_processor_tester.prepare_image_inputs(torchify=True)
+        for image_inputs in image_inputs_list:
+            for image in image_inputs:
+                self.assertIsInstance(image, torch.Tensor)
+
+        # Test not batched input
+        encoded_images = image_processing(image_inputs_list[0][0], return_tensors="pt").pixel_values
+        expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs_list[0][0])
+        self.assertEqual(tuple(encoded_images[0][0].shape), expected_output_image_shape)
+
+        # Test batched
+        batch_encoded_images = image_processing(image_inputs_list, return_tensors="pt").pixel_values
+        for encoded_images, images in zip(batch_encoded_images, image_inputs_list):
+            for encoded_image, image in zip(encoded_images, images):
+                expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image)
+                self.assertEqual(tuple(encoded_image.shape), expected_output_image_shape)
+
+    @unittest.skip(reason="PixtralImageProcessor doesn't treat 4 channel PIL and numpy consistently yet")  # FIXME Amy
+    def test_call_numpy_4_channels(self):
+        pass
diff --git a/tests/models/pixtral/test_modeling_pixtral.py b/tests/models/pixtral/test_modeling_pixtral.py
new file mode 100644
index 000000000000..bd41fa1c9e62
--- /dev/null
+++ b/tests/models/pixtral/test_modeling_pixtral.py
@@ -0,0 +1,292 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Testing suite for the PyTorch Pixtral model."""
+
+import gc
+import unittest
+
+import requests
+
+from transformers import (
+    AutoProcessor,
+    PixtralModel,
+    PixtralVisionConfig,
+    is_torch_available,
+    is_vision_available,
+)
+from transformers.testing_utils import (
+    require_bitsandbytes,
+    require_torch,
+    slow,
+    torch_device,
+)
+
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import ModelTesterMixin, floats_tensor
+
+
+if is_torch_available():
+    import torch
+else:
+    is_torch_greater_or_equal_than_2_0 = False
+
+if is_vision_available():
+    from PIL import Image
+
+
+class PixtralModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=12,
+        image_size=30,
+        patch_size=2,
+        num_channels=3,
+        is_training=True,
+        hidden_size=32,
+        projection_dim=32,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        intermediate_size=37,
+        dropout=0.1,
+        attention_dropout=0.1,
+        initializer_range=0.02,
+        scope=None,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.is_training = is_training
+        self.hidden_size = hidden_size
+        self.projection_dim = projection_dim
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.initializer_range = initializer_range
+        self.scope = scope
+
+        # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token)
+        num_patches = (image_size // patch_size) ** 2
+        self.seq_length = num_patches + 1
+
+    def prepare_config_and_inputs(self):
+        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
+        config = self.get_config()
+
+        return config, pixel_values
+
+    def get_config(self):
+        return PixtralVisionConfig(
+            image_size=self.image_size,
+            patch_size=self.patch_size,
+            num_channels=self.num_channels,
+            hidden_size=self.hidden_size,
+            projection_dim=self.projection_dim,
+            num_hidden_layers=self.num_hidden_layers,
+            num_attention_heads=self.num_attention_heads,
+            intermediate_size=self.intermediate_size,
+            dropout=self.dropout,
+            attention_dropout=self.attention_dropout,
+            initializer_range=self.initializer_range,
+        )
+
+    def create_and_check_model(self, config, pixel_values):
+        model = PixtralModel(config=config)
+        model.to(torch_device)
+        model.eval()
+        with torch.no_grad():
+            result = model(pixel_values)
+        # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token)
+        image_size = (self.image_size, self.image_size)
+        patch_size = (self.patch_size, self.patch_size)
+        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size))
+        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
+
+    def create_and_check_model_with_projection(self, config, pixel_values):
+        model = PixtralModel(config=config)
+        model.to(torch_device)
+        model.eval()
+        with torch.no_grad():
+            result = model(pixel_values)
+        # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token)
+        image_size = (self.image_size, self.image_size)
+        patch_size = (self.patch_size, self.patch_size)
+        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size))
+        self.parent.assertEqual(result.image_embeds.shape, (self.batch_size, self.projection_dim))
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, pixel_values = config_and_inputs
+        inputs_dict = {"pixel_values": pixel_values}
+        return config, inputs_dict
+
+
+@require_torch
+class PixtralModelModelTest(ModelTesterMixin, unittest.TestCase):
+    """
+    Model tester for `PixtralModel`.
+    """
+
+    all_model_classes = (PixtralModel,) if is_torch_available() else ()
+    test_pruning = False
+    test_head_masking = False
+
+    def setUp(self):
+        self.model_tester = PixtralModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=PixtralVisionConfig, has_text_modality=False)
+
+    @unittest.skip("model does not support input embeds")
+    def test_inputs_embeds(self):
+        pass
+
+    @unittest.skip("model does not support input embeds")
+    def test_inputs_embeds_matches_input_ids(self):
+        pass
+
+    @unittest.skip(
+        reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
+    )
+    def test_training_gradient_checkpointing(self):
+        pass
+
+    @unittest.skip(
+        reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
+    )
+    def test_training_gradient_checkpointing_use_reentrant(self):
+        pass
+
+    @unittest.skip(
+        reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
+    )
+    def test_training_gradient_checkpointing_use_reentrant_false(self):
+        pass
+
+    @unittest.skip(reason="Compile not yet supported because in Pixtral models")
+    def test_sdpa_can_compile_dynamic(self):
+        pass
+
+    @unittest.skip(reason="Compile not yet supported because in Pixtral models")
+    def test_sdpa_can_dispatch_on_flash(self):
+        pass
+
+    @unittest.skip(reason="Not supported yet")
+    def test_attention_outputs(self):
+        pass
+
+    @unittest.skip(reason="Not supported yet")
+    def test_cpu_offload(self):
+        pass
+
+    @unittest.skip(reason="Not supported yet")
+    def test_batching_equivalence(self):
+        pass
+
+    @unittest.skip(reason="Not supported yet")
+    def test_disk_offload_bin(self):
+        pass
+
+    @unittest.skip(reason="Not supported yet")
+    def test_retain_grad_hidden_states_attentions(self):
+        pass
+
+    @unittest.skip(reason="Not supported yet")
+    def test_multi_gpu_data_parallel_forward(self):
+        pass
+
+    @unittest.skip(reason="Not supported yet")
+    def test_model_parallelism(self):
+        pass
+
+    @unittest.skip(reason="Not supported yet")
+    def test_model_outputs_equivalence(self):
+        pass
+
+    @unittest.skip(reason="Not supported yet")
+    def test_save_load(self):
+        pass
+
+    @unittest.skip(reason="Not supported yet")
+    def test_model_get_set_embeddings(self):
+        pass
+
+    @unittest.skip(reason="Not supported yet")
+    def test_resize_tokens_embeddings(self):
+        pass
+
+    @unittest.skip(reason="Not supported yet")
+    def test_model_main_input_name(self):
+        pass
+
+    @unittest.skip(reason="Not supported yet")
+    def test_initialization(self):
+        pass
+
+    @unittest.skip(reason="Not supported yet")
+    def test_hidden_states_output(self):
+        pass
+
+    @unittest.skip(reason="Not supported yet")
+    def test_gradient_checkpointing_backward_compatibility(self):
+        pass
+
+    @unittest.skip(reason="Not supported yet")
+    def test_feed_forward_chunking(self):
+        pass
+
+    @unittest.skip(reason="Not supported yet")
+    def test_disk_offload_safetensors(self):
+        pass
+
+    @unittest.skip(reason="Not supported yet")
+    def test_determinism(self):
+        pass
+
+
+@require_torch
+class PixtralModelIntegrationTest(unittest.TestCase):
+    def setUp(self):
+        self.processor = AutoProcessor.from_pretrained("hf-internal-testing/pixtral-12b")
+
+    def tearDown(self):
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    @slow
+    @require_bitsandbytes
+    def test_small_model_integration_test(self):
+        # Let' s make sure we test the preprocessing to replace what is used
+        model = PixtralModel.from_pretrained("hf-internal-testing/pixtral-12b", load_in_4bit=True)
+
+        prompt = "<s>[INST][IMG]\nWhat are the things I should be cautious about when I visit this place?[/INST]"
+        image_file = "https://pixtral-vl.github.io/static/images/view.jpg"
+        raw_image = Image.open(requests.get(image_file, stream=True).raw)
+        inputs = self.processor(prompt, raw_image, return_tensors="pt")
+
+        EXPECTED_INPUT_IDS = torch.tensor([[1, 32000, 28705, 13, 11123, 28747, 1824, 460, 272, 1722,315, 1023, 347, 13831, 925, 684, 739, 315, 3251, 456,1633, 28804, 13, 4816, 8048, 12738, 28747]])  # fmt: skip
+        self.assertTrue(torch.equal(inputs["input_ids"], EXPECTED_INPUT_IDS))
+
+        output = model.generate(**inputs, max_new_tokens=20)
+        EXPECTED_DECODED_TEXT = "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT: When visiting this place, there are a few things one should be cautious about. Firstly,"  # fmt: skip
+
+        self.assertEqual(
+            self.processor.decode(output[0], skip_special_tokens=True),
+            EXPECTED_DECODED_TEXT,
+        )
diff --git a/tests/models/pixtral/test_processor_pixtral.py b/tests/models/pixtral/test_processor_pixtral.py
new file mode 100644
index 000000000000..b70cab1c0744
--- /dev/null
+++ b/tests/models/pixtral/test_processor_pixtral.py
@@ -0,0 +1,233 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+import requests
+import torch
+
+from transformers.testing_utils import require_vision
+from transformers.utils import is_vision_available
+
+
+if is_vision_available():
+    from PIL import Image
+
+    from transformers import AutoTokenizer, PixtralImageProcessor, PixtralProcessor
+
+
+@require_vision
+class PixtralProcessorTest(unittest.TestCase):
+    processor_class = PixtralProcessor
+
+    @classmethod
+    def setUpClass(cls):
+        cls.url_0 = "https://www.ilankelman.org/stopsigns/australia.jpg"
+        cls.image_0 = Image.open(requests.get(cls.url_0, stream=True).raw)
+        cls.url_1 = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        cls.image_1 = Image.open(requests.get(cls.url_1, stream=True).raw)
+        cls.url_2 = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
+        cls.image_2 = Image.open(requests.get(cls.url_2, stream=True).raw)
+
+    def setUp(self):
+        super().setUp()
+
+        # FIXME - just load the processor directly from the checkpoint
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/pixtral-12b")
+        image_processor = PixtralImageProcessor()
+        self.processor = PixtralProcessor(tokenizer=tokenizer, image_processor=image_processor)
+
+    @unittest.skip("No chat template was set for this model (yet)")
+    def test_chat_template(self):
+        expected_prompt = "USER: [IMG]\nWhat is shown in this image? ASSISTANT:"
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image"},
+                    {"type": "text", "text": "What is shown in this image?"},
+                ],
+            },
+        ]
+        formatted_prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True)
+        self.assertEqual(expected_prompt, formatted_prompt)
+
+    @unittest.skip("No chat template was set for this model (yet)")
+    def test_image_token_filling(self):
+        # Important to check with non square image
+        image = torch.randint(0, 2, (3, 500, 316))
+        expected_image_tokens = 1526
+        image_token_index = 32000
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image"},
+                    {"type": "text", "text": "What is shown in this image?"},
+                ],
+            },
+        ]
+        inputs = self.processor(
+            text=[self.processor.apply_chat_template(messages)],
+            images=[image],
+            return_tensors="pt",
+        )
+        image_tokens = (inputs["input_ids"] == image_token_index).sum().item()
+        self.assertEqual(expected_image_tokens, image_tokens)
+
+    def test_processor_with_single_image(self):
+        prompt_string = "USER: [IMG]\nWhat's the content of the image? ASSISTANT:"
+
+        # Make small for checking image token expansion
+        self.processor.image_processor.size = {"longest_edge": 30}
+        self.processor.image_processor.patch_size = {"height": 2, "width": 2}
+
+        # Test passing in an image
+        inputs_image = self.processor(text=prompt_string, images=self.image_0, return_tensors="pt")
+        self.assertIn("input_ids", inputs_image)
+        self.assertTrue(len(inputs_image["input_ids"]) == 1)
+        self.assertIsInstance(inputs_image["input_ids"], torch.Tensor)
+        self.assertIsInstance(inputs_image["pixel_values"], list)
+        self.assertTrue(len(inputs_image["pixel_values"]) == 1)
+        self.assertIsInstance(inputs_image["pixel_values"][0], list)
+        self.assertTrue(len(inputs_image["pixel_values"][0]) == 1)
+        self.assertIsInstance(inputs_image["pixel_values"][0][0], torch.Tensor)
+
+        # fmt: off
+        input_ids = inputs_image["input_ids"]
+        self.assertEqual(
+            input_ids[0].tolist(),
+            # Equivalent to "USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the content of the image? ASSISTANT:"
+            [21510,  1058,  1032,    10,    10,    12,    10,    10,    13,  1010, 7493,  1681,  1278,  4701,  1307,  1278,  3937,  1063,  1349,  4290, 16002, 41150,  1058]
+        )
+        # fmt: on
+
+        # Test passing in a url
+        inputs_url = self.processor(text=prompt_string, images=self.url_0, return_tensors="pt")
+        self.assertIn("input_ids", inputs_url)
+        self.assertTrue(len(inputs_url["input_ids"]) == 1)
+        self.assertIsInstance(inputs_url["input_ids"], torch.Tensor)
+        self.assertIsInstance(inputs_url["pixel_values"], list)
+        self.assertTrue(len(inputs_url["pixel_values"]) == 1)
+        self.assertIsInstance(inputs_url["pixel_values"][0], list)
+        self.assertTrue(len(inputs_url["pixel_values"][0]) == 1)
+        self.assertIsInstance(inputs_url["pixel_values"][0][0], torch.Tensor)
+
+        # fmt: off
+        input_ids = inputs_url["input_ids"]
+        self.assertEqual(
+            input_ids[0].tolist(),
+            # Equivalent to "USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the content of the image? ASSISTANT:"
+            [21510,  1058,  1032,    10,    10,    12,    10,    10,    13,  1010, 7493,  1681,  1278,  4701,  1307,  1278,  3937,  1063,  1349,  4290, 16002, 41150,  1058]
+        )
+        # fmt: on
+
+    def test_processor_with_multiple_images_single_list(self):
+        prompt_string = "USER: [IMG][IMG]\nWhat's the difference between these two images? ASSISTANT:"
+
+        # Make small for checking image token expansion
+        self.processor.image_processor.size = {"longest_edge": 30}
+        self.processor.image_processor.patch_size = {"height": 2, "width": 2}
+
+        # Test passing in an image
+        inputs_image = self.processor(text=prompt_string, images=[self.image_0, self.image_1], return_tensors="pt")
+        self.assertIn("input_ids", inputs_image)
+        self.assertTrue(len(inputs_image["input_ids"]) == 1)
+        self.assertIsInstance(inputs_image["input_ids"], torch.Tensor)
+        self.assertIsInstance(inputs_image["pixel_values"], list)
+        self.assertTrue(len(inputs_image["pixel_values"]) == 1)
+        self.assertIsInstance(inputs_image["pixel_values"][0], list)
+        self.assertTrue(len(inputs_image["pixel_values"][0]) == 2)
+        self.assertIsInstance(inputs_image["pixel_values"][0][0], torch.Tensor)
+
+        # fmt: off
+        input_ids = inputs_image["input_ids"]
+        self.assertEqual(
+            input_ids[0].tolist(),
+            # Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"]
+            [21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
+        )
+        # fmt: on
+
+        # Test passing in a url
+        inputs_url = self.processor(text=prompt_string, images=[self.url_0, self.url_1], return_tensors="pt")
+        self.assertIn("input_ids", inputs_url)
+        self.assertTrue(len(inputs_url["input_ids"]) == 1)
+        self.assertIsInstance(inputs_url["input_ids"], torch.Tensor)
+        self.assertIsInstance(inputs_url["pixel_values"], list)
+        self.assertTrue(len(inputs_url["pixel_values"]) == 1)
+        self.assertIsInstance(inputs_url["pixel_values"][0], list)
+        self.assertTrue(len(inputs_url["pixel_values"][0]) == 2)
+        self.assertIsInstance(inputs_url["pixel_values"][0][0], torch.Tensor)
+        # fmt: off
+        input_ids = inputs_url["input_ids"]
+        self.assertEqual(
+            input_ids[0].tolist(),
+            # Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"]
+            [21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
+        )
+        # fmt: on
+
+    def test_processor_with_multiple_images_multiple_lists(self):
+        prompt_string = [
+            "USER: [IMG][IMG]\nWhat's the difference between these two images? ASSISTANT:",
+            "USER: [IMG]\nWhat's the content of the image? ASSISTANT:",
+        ]
+        self.processor.tokenizer.pad_token = "</s>"
+        image_inputs = [[self.image_0, self.image_1], [self.image_2]]
+
+        # Make small for checking image token expansion
+        self.processor.image_processor.size = {"longest_edge": 30}
+        self.processor.image_processor.patch_size = {"height": 2, "width": 2}
+
+        # Test passing in an image
+        inputs_image = self.processor(text=prompt_string, images=image_inputs, return_tensors="pt", padding=True)
+        self.assertIn("input_ids", inputs_image)
+        self.assertTrue(len(inputs_image["input_ids"]) == 2)
+        self.assertIsInstance(inputs_image["input_ids"], torch.Tensor)
+        self.assertIsInstance(inputs_image["pixel_values"], list)
+        self.assertTrue(len(inputs_image["pixel_values"]) == 2)
+        self.assertIsInstance(inputs_image["pixel_values"][0], list)
+        self.assertTrue(len(inputs_image["pixel_values"][0]) == 2)
+        self.assertIsInstance(inputs_image["pixel_values"][0][0], torch.Tensor)
+
+        # fmt: off
+        input_ids = inputs_image["input_ids"]
+        self.assertEqual(
+            input_ids[0].tolist(),
+            # Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"]
+            [21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
+        )
+        # fmt: on
+
+        # Test passing in a url
+        inputs_url = self.processor(text=prompt_string, images=image_inputs, return_tensors="pt", padding=True)
+        self.assertIn("input_ids", inputs_url)
+        self.assertTrue(len(inputs_url["input_ids"]) == 2)
+        self.assertIsInstance(inputs_url["input_ids"], torch.Tensor)
+        self.assertIsInstance(inputs_url["pixel_values"], list)
+        self.assertTrue(len(inputs_url["pixel_values"]) == 2)
+        self.assertIsInstance(inputs_url["pixel_values"][0], list)
+        self.assertTrue(len(inputs_url["pixel_values"][0]) == 2)
+        self.assertIsInstance(inputs_url["pixel_values"][0][0], torch.Tensor)
+
+        # fmt: off
+        input_ids = inputs_url["input_ids"]
+        self.assertEqual(
+            input_ids[0].tolist(),
+            # Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"]
+            [21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
+        )
+        # fmt: on

From 95e816f2bca48de32167ce6243e6770dee23923d Mon Sep 17 00:00:00 2001
From: Joao Gante <joaofranciscocardosogante@gmail.com>
Date: Mon, 16 Sep 2024 09:44:57 +0100
Subject: [PATCH 09/67] Cohere: update RoPE structure (#33408)

---
 .../models/cohere/configuration_cohere.py     |  43 +++++
 .../models/cohere/modeling_cohere.py          | 170 ++++++++++++++----
 src/transformers/models/dbrx/modeling_dbrx.py |   2 +-
 .../models/gemma/modeling_gemma.py            |   2 +-
 .../models/granite/modeling_granite.py        |   2 +-
 .../models/llama/configuration_llama.py       |   2 +-
 .../models/llama/modeling_llama.py            |   2 +-
 .../models/mistral/modeling_mistral.py        |   2 +-
 .../models/mixtral/modeling_mixtral.py        |   2 +-
 src/transformers/models/olmo/modeling_olmo.py |   2 +-
 .../models/olmoe/modeling_olmoe.py            |   2 +-
 .../models/persimmon/modeling_persimmon.py    |   2 +-
 src/transformers/models/phi/modeling_phi.py   |   2 +-
 src/transformers/models/phi3/modeling_phi3.py |   2 +-
 .../models/qwen2/modeling_qwen2.py            |   2 +-
 .../models/qwen2_moe/modeling_qwen2_moe.py    |   2 +-
 .../models/stablelm/modeling_stablelm.py      |   2 +-
 .../models/starcoder2/modeling_starcoder2.py  |   2 +-
 18 files changed, 190 insertions(+), 55 deletions(-)

diff --git a/src/transformers/models/cohere/configuration_cohere.py b/src/transformers/models/cohere/configuration_cohere.py
index 73973bfad60b..3c1237e51137 100644
--- a/src/transformers/models/cohere/configuration_cohere.py
+++ b/src/transformers/models/cohere/configuration_cohere.py
@@ -20,6 +20,7 @@
 """Cohere model configuration"""
 
 from ...configuration_utils import PretrainedConfig
+from ...modeling_rope_utils import rope_config_validation
 from ...utils import logging
 
 
@@ -79,6 +80,43 @@ class CohereConfig(PretrainedConfig):
             Whether to tie weight embeddings
         rope_theta (`float`, *optional*, defaults to 10000.0):
             The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
         attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
             Whether to use a bias in the query, key, value and output projection layers during self-attention.
         attention_dropout (`float`, *optional*, defaults to 0.0):
@@ -121,6 +159,7 @@ def __init__(
         eos_token_id=255001,
         tie_word_embeddings=True,
         rope_theta=10000.0,
+        rope_scaling=None,
         attention_bias=False,
         attention_dropout=0.0,
         use_qk_norm=False,
@@ -144,10 +183,14 @@ def __init__(
         self.layer_norm_eps = layer_norm_eps
         self.use_cache = use_cache
         self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
         self.attention_bias = attention_bias
         self.attention_dropout = attention_dropout
         self.use_qk_norm = use_qk_norm
 
+        # Validate the correctness of rotary position embeddings parameters
+        rope_config_validation(self)
+
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py
index 4010d9ec3a43..ae84a9ec2d1a 100644
--- a/src/transformers/models/cohere/modeling_cohere.py
+++ b/src/transformers/models/cohere/modeling_cohere.py
@@ -37,6 +37,7 @@
     BaseModelOutputWithPast,
     CausalLMOutputWithPast,
 )
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
 from ...modeling_utils import PreTrainedModel
 from ...pytorch_utils import ALL_LAYERNORM_LAYERS
 from ...utils import (
@@ -135,35 +136,97 @@ def forward(self, hidden_states):
 
 
 class CohereRotaryEmbedding(nn.Module):
-    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+    # Note: the forward pass of this RoPE is slightly different from Llama's, resulting in different `sin`/`cos` for
+    # the same parameterization. The differences are highlighted with a comment.
+
+    def __init__(
+        self,
+        dim=None,
+        max_position_embeddings=2048,
+        base=10000,
+        device=None,
+        scaling_factor=1.0,
+        rope_type="default",
+        config: Optional[CohereConfig] = None,
+    ):
         super().__init__()
-        self.scaling_factor = scaling_factor
-        self.dim = dim
-        self.max_position_embeddings = max_position_embeddings
-        self.base = base
-        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
+        # TODO (joao): remove the `if` below, only used for BC
+        self.rope_kwargs = {}
+        if config is None:
+            logger.warning_once(
+                "`CohereRotaryEmbedding` can now be fully parameterized by passing the model config through the "
+                "`config` argument. All other arguments will be removed in v4.46"
+            )
+            self.rope_kwargs = {
+                "rope_type": rope_type,
+                "factor": scaling_factor,
+                "dim": dim,
+                "base": base,
+                "max_position_embeddings": max_position_embeddings,
+            }
+            self.rope_type = rope_type
+            self.max_seq_len_cached = max_position_embeddings
+            self.original_max_seq_len = max_position_embeddings
+        else:
+            # BC: "rope_type" was originally "type"
+            if config.rope_scaling is not None:
+                self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+            else:
+                self.rope_type = "default"
+            self.max_seq_len_cached = config.max_position_embeddings
+            self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
         self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    def _dynamic_frequency_update(self, position_ids, device):
+        """
+        dynamic RoPE layers should recompute `inv_freq` in the following situations:
+        1 - growing beyond the cached sequence length (allow scaling)
+        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
+        """
+        seq_len = torch.max(position_ids) + 1
+        if seq_len > self.max_seq_len_cached:  # growth
+            inv_freq, self.attention_scaling = self.rope_init_fn(
+                self.config, device, seq_len=seq_len, **self.rope_kwargs
+            )
+            self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: may break with compilation
+            self.max_seq_len_cached = seq_len
+
+        if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:  # reset
+            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+            self.max_seq_len_cached = self.original_max_seq_len
 
     @torch.no_grad()
     def forward(self, x, position_ids):
-        # x: [bs, num_attention_heads, seq_len, head_size]
+        if "dynamic" in self.rope_type:
+            self._dynamic_frequency_update(position_ids, device=x.device)
+
+        # Core RoPE block
         inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
         position_ids_expanded = position_ids[:, None, :].float()
-
-        # Force float32 since bfloat16 loses precision on long contexts
-        # See https://github.com/huggingface/transformers/pull/29285
+        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
         device_type = x.device.type
         device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
         with torch.autocast(device_type=device_type, enabled=False):
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
-            emb = torch.repeat_interleave(freqs, 2, dim=-1)
+            emb = torch.repeat_interleave(freqs, 2, dim=-1)  # This line differs from Llama's implementation
             cos = emb.cos()
             sin = emb.sin()
-        return cos, sin
+
+        # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
+        cos = cos * self.attention_scaling
+        sin = sin * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
 
 
 def rotate_half(x):
-    # Split and rotate
+    # Split and rotate. Note that this function is different from e.g. Llama.
     x1 = x[..., ::2]
     x2 = x[..., 1::2]
     rot_x = torch.stack([-x2, x1], dim=-1).flatten(-2)
@@ -272,17 +335,10 @@ def __init__(self, config: CohereConfig, layer_idx: Optional[int] = None):
         self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
         self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
         self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias)
-        self._init_rope()
 
-    # Ignore copy
-    def _init_rope(self):
-        self.rotary_emb = CohereRotaryEmbedding(
-            self.head_dim,
-            max_position_embeddings=self.max_position_embeddings,
-            base=self.rope_theta,
-        )
+        # TODO (joao): remove in v4.46 (RoPE is computed in the model, not in the decoder layers)
+        self.rotary_emb = CohereRotaryEmbedding(config=self.config)
 
-    # Ignore copy
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -292,6 +348,7 @@ def forward(
         output_attentions: bool = False,
         use_cache: bool = False,
         cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
         **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         bsz, q_len, _ = hidden_states.size()
@@ -310,7 +367,16 @@ def forward(
         key_states = key_states.transpose(1, 2)
         value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
 
-        cos, sin = self.rotary_emb(value_states, position_ids)
+        if position_embeddings is None:
+            logger.warning_once(
+                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
+                "removed and `position_embeddings` will be mandatory."
+            )
+            cos, sin = self.rotary_emb(value_states, position_ids)
+        else:
+            cos, sin = position_embeddings
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
 
         if past_key_value is not None:
@@ -350,8 +416,7 @@ def forward(
         return attn_output, attn_weights, past_key_value
 
 
-# copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Cohere
-# TODO(joao): add me back asap :)
+# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Cohere
 class CohereFlashAttention2(CohereAttention):
     """
     Cohere flash attention module. This module inherits from `CohereAttention` as the weights of the module stays
@@ -377,6 +442,7 @@ def forward(
         output_attentions: bool = False,
         use_cache: bool = False,
         cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
         **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         if isinstance(past_key_value, StaticCache):
@@ -402,7 +468,16 @@ def forward(
         key_states = key_states.transpose(1, 2)
         value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
 
-        cos, sin = self.rotary_emb(value_states, position_ids)
+        if position_embeddings is None:
+            logger.warning_once(
+                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
+                "removed and `position_embeddings` will be mandatory."
+            )
+            cos, sin = self.rotary_emb(value_states, position_ids)
+        else:
+            cos, sin = position_embeddings
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
 
         if past_key_value is not None:
@@ -418,7 +493,6 @@ def forward(
 
         dropout_rate = self.attention_dropout if self.training else 0.0
 
-        # Ignore copy
         # In PEFT, usually we cast the layer norms in float32 for training stability reasons
         # therefore the input hidden states gets silently casted in float32. Hence, we need
         # cast them back in the correct dtype just to be sure everything works as expected.
@@ -465,8 +539,6 @@ def forward(
         return attn_output, attn_weights, past_key_value
 
 
-# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention Llama->Cohere
-# TODO(joao): add me back asap :)
 class CohereSdpaAttention(CohereAttention):
     """
     Cohere attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
@@ -474,7 +546,6 @@ class CohereSdpaAttention(CohereAttention):
     SDPA API.
     """
 
-    # Ignore copy
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -484,6 +555,7 @@ def forward(
         output_attentions: bool = False,
         use_cache: bool = False,
         cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         if output_attentions:
             # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
@@ -517,7 +589,16 @@ def forward(
         key_states = key_states.transpose(1, 2)
         value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
 
-        cos, sin = self.rotary_emb(value_states, position_ids)
+        if position_embeddings is None:
+            logger.warning_once(
+                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
+                "removed and `position_embeddings` will be mandatory."
+            )
+            cos, sin = self.rotary_emb(value_states, position_ids)
+        else:
+            cos, sin = position_embeddings
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
 
         if past_key_value is not None:
@@ -587,6 +668,7 @@ def forward(
         output_attentions: Optional[bool] = False,
         use_cache: Optional[bool] = False,
         cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
     ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
         """
         Args:
@@ -601,6 +683,11 @@ def forward(
                 If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                 (see `past_key_values`).
             past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence
+            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
+                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
+                with `head_dim` being the embedding dimension of each attention head.
         """
         residual = hidden_states
 
@@ -615,6 +702,7 @@ def forward(
             output_attentions=output_attentions,
             use_cache=use_cache,
             cache_position=cache_position,
+            position_embeddings=position_embeddings,
         )
 
         # Fully Connected
@@ -755,8 +843,7 @@ def _init_weights(self, module):
     "The bare Cohere Model outputting raw hidden-states without any specific head on top.",
     COHERE_START_DOCSTRING,
 )
-# copied from transformers.models.llama.modeling_llama.LlamaModel with Llama->Cohere
-# TODO(joao): add me back asap :)
+# Copied from transformers.models.llama.modeling_llama.LlamaModel with Llama->Cohere, LLAMA->COHERE
 class CohereModel(CoherePreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`CohereDecoderLayer`]
@@ -776,6 +863,7 @@ def __init__(self, config: CohereConfig):
             [CohereDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
         )
         self.norm = CohereLayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps)
+        self.rotary_emb = CohereRotaryEmbedding(config=config)
         self.gradient_checkpointing = False
 
         # Initialize weights and apply final processing
@@ -787,14 +875,13 @@ def get_input_embeddings(self):
     def set_input_embeddings(self, value):
         self.embed_tokens = value
 
-    # Ignore copy
     @add_start_docstrings_to_model_forward(COHERE_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
@@ -823,30 +910,33 @@ def forward(
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
 
-        past_seen_tokens = 0
         return_legacy_cache = False
         if (
             use_cache and not isinstance(past_key_values, Cache) and not self.training
         ):  # kept for BC (non `Cache` `past_key_values` inputs)
             return_legacy_cache = True
             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+            logger.warning_once(
+                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
+                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
+            )
 
         if cache_position is None:
             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
             cache_position = torch.arange(
                 past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
             )
-
         if position_ids is None:
             position_ids = cache_position.unsqueeze(0)
 
         causal_mask = self._update_causal_mask(
             attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
         )
-
-        # embed positions
         hidden_states = inputs_embeds
 
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
         # decoder layers
         all_hidden_states = () if output_hidden_states else None
         all_self_attns = () if output_attentions else None
@@ -866,6 +956,7 @@ def forward(
                     output_attentions,
                     use_cache,
                     cache_position,
+                    position_embeddings,
                 )
             else:
                 layer_outputs = decoder_layer(
@@ -876,6 +967,7 @@ def forward(
                     output_attentions=output_attentions,
                     use_cache=use_cache,
                     cache_position=cache_position,
+                    position_embeddings=position_embeddings,
                 )
 
             hidden_states = layer_outputs[0]
diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py
index 8db9f6e8b7d0..43bac44ba1be 100644
--- a/src/transformers/models/dbrx/modeling_dbrx.py
+++ b/src/transformers/models/dbrx/modeling_dbrx.py
@@ -1066,7 +1066,7 @@ def forward(
             return_legacy_cache = True
             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
             logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
                 "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
             )
 
diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py
index 085751cd9bc0..b14e0a4b3d8c 100644
--- a/src/transformers/models/gemma/modeling_gemma.py
+++ b/src/transformers/models/gemma/modeling_gemma.py
@@ -862,7 +862,7 @@ def forward(
             return_legacy_cache = True
             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
             logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
                 "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
             )
 
diff --git a/src/transformers/models/granite/modeling_granite.py b/src/transformers/models/granite/modeling_granite.py
index ff10b6e6d875..876f5ed2a7c8 100644
--- a/src/transformers/models/granite/modeling_granite.py
+++ b/src/transformers/models/granite/modeling_granite.py
@@ -839,7 +839,7 @@ def forward(
             return_legacy_cache = True
             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
             logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
                 "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
             )
 
diff --git a/src/transformers/models/llama/configuration_llama.py b/src/transformers/models/llama/configuration_llama.py
index 435f0091e06e..a3667e065345 100644
--- a/src/transformers/models/llama/configuration_llama.py
+++ b/src/transformers/models/llama/configuration_llama.py
@@ -192,7 +192,7 @@ def __init__(
         self.mlp_bias = mlp_bias
         self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
         # Validate the correctness of rotary position embeddings parameters
-        # BC: if there is a 'type' field, move it to 'rope_type'.
+        # BC: if there is a 'type' field, copy it it to 'rope_type'.
         if self.rope_scaling is not None and "type" in self.rope_scaling:
             self.rope_scaling["rope_type"] = self.rope_scaling["type"]
         rope_config_validation(self)
diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py
index 9a1d6c0749f9..c7017832b932 100644
--- a/src/transformers/models/llama/modeling_llama.py
+++ b/src/transformers/models/llama/modeling_llama.py
@@ -951,7 +951,7 @@ def forward(
             return_legacy_cache = True
             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
             logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
                 "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
             )
 
diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py
index c43418182c38..ffe16b272033 100644
--- a/src/transformers/models/mistral/modeling_mistral.py
+++ b/src/transformers/models/mistral/modeling_mistral.py
@@ -767,7 +767,7 @@ def forward(
             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
             return_legacy_cache = True
             logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
                 "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
             )
 
diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py
index 2e23d0669908..c7062e75b108 100644
--- a/src/transformers/models/mixtral/modeling_mixtral.py
+++ b/src/transformers/models/mixtral/modeling_mixtral.py
@@ -1023,7 +1023,7 @@ def forward(
             use_legacy_cache = True
             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
             logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
                 "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
             )
 
diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py
index 007e69570e78..b4bda8e2db52 100644
--- a/src/transformers/models/olmo/modeling_olmo.py
+++ b/src/transformers/models/olmo/modeling_olmo.py
@@ -873,7 +873,7 @@ def forward(
             return_legacy_cache = True
             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
             logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
                 "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
             )
 
diff --git a/src/transformers/models/olmoe/modeling_olmoe.py b/src/transformers/models/olmoe/modeling_olmoe.py
index a53f1eeda611..a33338365312 100644
--- a/src/transformers/models/olmoe/modeling_olmoe.py
+++ b/src/transformers/models/olmoe/modeling_olmoe.py
@@ -1012,7 +1012,7 @@ def forward(
             return_legacy_cache = True
             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
             logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
                 "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
             )
 
diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py
index 9fab09bdcc78..ccaa2c7fd29a 100644
--- a/src/transformers/models/persimmon/modeling_persimmon.py
+++ b/src/transformers/models/persimmon/modeling_persimmon.py
@@ -690,7 +690,7 @@ def forward(
             use_legacy_cache = True
             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
             logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
                 "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
             )
 
diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py
index 0d8be04af20d..648d1653a3b5 100644
--- a/src/transformers/models/phi/modeling_phi.py
+++ b/src/transformers/models/phi/modeling_phi.py
@@ -981,7 +981,7 @@ def forward(
             use_legacy_cache = True
             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
             logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
                 "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
             )
 
diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py
index 273b6a8f505e..ec395679ae62 100644
--- a/src/transformers/models/phi3/modeling_phi3.py
+++ b/src/transformers/models/phi3/modeling_phi3.py
@@ -1008,7 +1008,7 @@ def forward(
             use_legacy_cache = True
             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
             logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
                 "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
             )
 
diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py
index 030c74b034b7..d0ea8ef0e376 100644
--- a/src/transformers/models/qwen2/modeling_qwen2.py
+++ b/src/transformers/models/qwen2/modeling_qwen2.py
@@ -920,7 +920,7 @@ def forward(
             use_legacy_cache = True
             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
             logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
                 "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
             )
 
diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
index b196ed72a49b..6f483e50cde0 100644
--- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
+++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
@@ -1084,7 +1084,7 @@ def forward(
             use_legacy_cache = True
             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
             logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
                 "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
             )
 
diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py
index 27d0c856a61b..d91c0832ed33 100755
--- a/src/transformers/models/stablelm/modeling_stablelm.py
+++ b/src/transformers/models/stablelm/modeling_stablelm.py
@@ -965,7 +965,7 @@ def forward(
             use_legacy_cache = True
             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
             logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
                 "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
             )
 
diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py
index c359c07c69c0..0be37c4e1fb9 100644
--- a/src/transformers/models/starcoder2/modeling_starcoder2.py
+++ b/src/transformers/models/starcoder2/modeling_starcoder2.py
@@ -894,7 +894,7 @@ def forward(
             use_legacy_cache = True
             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
             logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
+                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
                 "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
             )
 

From 5ce0a113b5bc9dd8dbb92dd866772d79847d9a92 Mon Sep 17 00:00:00 2001
From: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Date: Mon, 16 Sep 2024 11:07:59 +0200
Subject: [PATCH 10/67] Fix SSH workflow (#33451)

* fix

* update

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
---
 .github/workflows/ssh-runner.yml | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ssh-runner.yml b/.github/workflows/ssh-runner.yml
index b433abb484fa..db649876f604 100644
--- a/.github/workflows/ssh-runner.yml
+++ b/.github/workflows/ssh-runner.yml
@@ -58,8 +58,19 @@ jobs:
         #because the SSH can be enabled dynamically if the workflow failed, so we need to store slack infos to be able to retrieve them during the waitforssh step
         shell: bash
         run: |
-          if [ "${{ secrets[format('{0}_{1}', github.actor, 'SLACK_ID')] }}" != "" ]; then
-            echo "SLACKCHANNEL=${{ secrets[format('{0}_{1}', github.actor, 'SLACK_ID')] }}" >> $GITHUB_ENV
+          echo "${{ github.actor }}"
+          github_actor=${{ github.actor }}
+          github_actor=${github_actor/'-'/'_'}
+          echo "$github_actor"
+          echo "github_actor=$github_actor" >> $GITHUB_ENV
+
+      - name: Store Slack infos
+        #because the SSH can be enabled dynamically if the workflow failed, so we need to store slack infos to be able to retrieve them during the waitforssh step
+        shell: bash
+        run: |
+          echo "${{ env.github_actor }}"
+          if [ "${{ secrets[format('{0}_{1}', env.github_actor, 'SLACK_ID')] }}" != "" ]; then
+            echo "SLACKCHANNEL=${{ secrets[format('{0}_{1}', env.github_actor, 'SLACK_ID')] }}" >> $GITHUB_ENV
           else
             echo "SLACKCHANNEL=${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}" >> $GITHUB_ENV
           fi

From ce62a41880b5b70a304d068eb58f55894a5a7af8 Mon Sep 17 00:00:00 2001
From: Merve Noyan <merveenoyan@gmail.com>
Date: Mon, 16 Sep 2024 13:08:31 +0200
Subject: [PATCH 11/67] Add keypoint-detection task guide (#33274)

---------

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
---
 docs/source/en/_toctree.yml                |   2 +
 docs/source/en/tasks/keypoint_detection.md | 154 +++++++++++++++++++++
 2 files changed, 156 insertions(+)
 create mode 100644 docs/source/en/tasks/keypoint_detection.md

diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 235ea81a7f1e..7eff2a383026 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -81,6 +81,8 @@
       title: Image Feature Extraction
     - local: tasks/mask_generation
       title: Mask Generation
+    - local: tasks/keypoint_detection
+      title: Keypoint Detection
     - local: tasks/knowledge_distillation_for_image_classification
       title: Knowledge Distillation for Computer Vision
     title: Computer Vision
diff --git a/docs/source/en/tasks/keypoint_detection.md b/docs/source/en/tasks/keypoint_detection.md
new file mode 100644
index 000000000000..a0ec71a5c220
--- /dev/null
+++ b/docs/source/en/tasks/keypoint_detection.md
@@ -0,0 +1,154 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Keypoint Detection
+
+[[open-in-colab]]
+
+Keypoint detection identifies and locates specific points of interest within an image. These keypoints, also known as landmarks, represent meaningful features of objects, such as facial features or object parts. These models take an image input and return the following outputs: 
+
+- **Keypoints and Scores**: Points of interest and their confidence scores.
+- **Descriptors**: A representation of the image region surrounding each keypoint, capturing its texture, gradient, orientation and other properties.
+
+In this guide, we will show how to extract keypoints from images.
+
+For this tutorial, we will use [SuperPoint](./model_doc/superpoint.md), a foundation model for keypoint detection.
+
+```python
+from transformers import AutoImageProcessor, SuperPointForKeypointDetection
+processor = AutoImageProcessor.from_pretrained("magic-leap-community/superpoint")
+model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/superpoint")
+```
+
+Let's test the model on the images below.
+
+<div style="display: flex; align-items: center;">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg" 
+         alt="Bee" 
+         style="height: 200px; object-fit: contain; margin-right: 10px;">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png" 
+         alt="Cats" 
+         style="height: 200px; object-fit: contain;">
+</div>
+
+
+```python
+import torch
+from PIL import Image
+import requests
+import cv2
+
+
+url_image_1 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"
+image_1 = Image.open(requests.get(url_image_1, stream=True).raw)
+url_image_2 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png"
+image_2 = Image.open(requests.get(url_image_2, stream=True).raw)
+
+images = [image_1, image_2]
+```
+
+We can now process our inputs and infer.
+
+```python
+inputs = processor(images,return_tensors="pt").to(model.device, model.dtype)
+outputs = model(**inputs)
+```
+
+The model output has relative keypoints, descriptors, masks and scores for each item in the batch. The mask highlights areas of the image where keypoints are present.
+
+```python
+SuperPointKeypointDescriptionOutput(loss=None, keypoints=tensor([[[0.0437, 0.0167],
+         [0.0688, 0.0167],
+         [0.0172, 0.0188],
+         ...,
+         [0.5984, 0.9812],
+         [0.6953, 0.9812]]]), 
+         scores=tensor([[0.0056, 0.0053, 0.0079,  ..., 0.0125, 0.0539, 0.0377],
+        [0.0206, 0.0058, 0.0065,  ..., 0.0000, 0.0000, 0.0000]],
+       grad_fn=<CopySlices>), descriptors=tensor([[[-0.0807,  0.0114, -0.1210,  ..., -0.1122,  0.0899,  0.0357],
+         [-0.0807,  0.0114, -0.1210,  ..., -0.1122,  0.0899,  0.0357],
+         [-0.0807,  0.0114, -0.1210,  ..., -0.1122,  0.0899,  0.0357],
+         ...],
+       grad_fn=<CopySlices>), mask=tensor([[1, 1, 1,  ..., 1, 1, 1],
+        [1, 1, 1,  ..., 0, 0, 0]], dtype=torch.int32), hidden_states=None)
+```
+
+To plot actual keypoints in the image, we need to postprocess the output. To do so, we have to pass the actual image sizes to `post_process_keypoint_detection` along with outputs.
+
+```python
+image_sizes = [(image.size[1], image.size[0]) for image in images]
+outputs = processor.post_process_keypoint_detection(outputs, image_sizes)
+```
+
+The outputs are now a list of dictionaries where each dictionary is a processed output of keypoints, scores and descriptors. 
+
+```python
+[{'keypoints': tensor([[ 226,   57],
+          [ 356,   57],
+          [  89,   64],
+          ...,
+          [3604, 3391]], dtype=torch.int32),
+  'scores': tensor([0.0056, 0.0053, ...], grad_fn=<IndexBackward0>),
+  'descriptors': tensor([[-0.0807,  0.0114, -0.1210,  ..., -0.1122,  0.0899,  0.0357],
+          [-0.0807,  0.0114, -0.1210,  ..., -0.1122,  0.0899,  0.0357]],
+         grad_fn=<IndexBackward0>)},
+    {'keypoints': tensor([[ 46,   6],
+          [ 78,   6],
+          [422,   6],
+          [206, 404]], dtype=torch.int32),
+  'scores': tensor([0.0206, 0.0058, 0.0065, 0.0053, 0.0070, ...,grad_fn=<IndexBackward0>),
+  'descriptors': tensor([[-0.0525,  0.0726,  0.0270,  ...,  0.0389, -0.0189, -0.0211],
+          [-0.0525,  0.0726,  0.0270,  ...,  0.0389, -0.0189, -0.0211]}]
+```
+
+We can use these to plot the keypoints.
+
+```python
+import matplotlib.pyplot as plt
+import torch
+
+for i in range(len(images)):
+  keypoints = outputs[i]["keypoints"]
+  scores = outputs[i]["scores"]
+  descriptors = outputs[i]["descriptors"]
+  keypoints = outputs[i]["keypoints"].detach().numpy()
+  scores = outputs[i]["scores"].detach().numpy()
+  image = images[i]
+  image_width, image_height = image.size
+
+  plt.axis('off')
+  plt.imshow(image)
+  plt.scatter(
+      keypoints[:, 0],
+      keypoints[:, 1],
+      s=scores * 100,
+      c='cyan',
+      alpha=0.4
+  )
+  plt.show()
+```
+
+Below you can see the outputs.
+
+<div style="display: flex; align-items: center;">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee_keypoint.png" 
+         alt="Bee" 
+         style="height: 200px; object-fit: contain; margin-right: 10px;">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats_keypoint.png" 
+         alt="Cats" 
+         style="height: 200px; object-fit: contain;">
+</div>
+

From 2f62146f0e916c3e6752b59d34853be6df0506f2 Mon Sep 17 00:00:00 2001
From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com>
Date: Mon, 16 Sep 2024 11:26:26 -0400
Subject: [PATCH 12/67] Uniformize kwargs for LLaVa processor and update docs
 (#32858)

* Uniformize kwargs for LlaVa and update docs

* Change order of processor inputs in docstring

* Improve BC support for reversed images and text inputs

* cleanup llava processor call docstring

* Add encoded inputs as valid text inputs in reverse input check, add deprecation version in warning

* Put function check reversed images text outside base processor class

* Refactor _validate_images_text_input_order

* Add ProcessingUtilTester

* fix processing and test_processing
---
 .../models/llava/modeling_llava.py            |  2 +-
 .../models/llava/processing_llava.py          | 73 ++++++++++---------
 tests/models/llava/test_modeling_llava.py     | 20 ++---
 tests/models/llava/test_processor_llava.py    | 57 ++++++++++++++-
 4 files changed, 104 insertions(+), 48 deletions(-)

diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py
index 9ad19ccee722..eb1c55341b07 100644
--- a/src/transformers/models/llava/modeling_llava.py
+++ b/src/transformers/models/llava/modeling_llava.py
@@ -405,7 +405,7 @@ def forward(
         >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> inputs = processor(text=prompt, images=image, return_tensors="pt")
+        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")
 
         >>> # Generate
         >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
diff --git a/src/transformers/models/llava/processing_llava.py b/src/transformers/models/llava/processing_llava.py
index 678724ae95be..28a9410e6cbf 100644
--- a/src/transformers/models/llava/processing_llava.py
+++ b/src/transformers/models/llava/processing_llava.py
@@ -16,18 +16,33 @@
 Processor class for Llava.
 """
 
-from typing import List, Optional, Union
+import sys
+from typing import List, Union
 
 from ...feature_extraction_utils import BatchFeature
 from ...image_utils import ImageInput, get_image_size, to_numpy_array
-from ...processing_utils import ProcessorMixin
-from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
-from ...utils import TensorType, logging
+from ...processing_utils import ProcessingKwargs, ProcessorMixin, _validate_images_text_input_order
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import logging
 
 
+if sys.version_info >= (3, 11):
+    from typing import Unpack
+else:
+    from typing_extensions import Unpack
+
 logger = logging.get_logger(__name__)
 
 
+class LlavaProcessorKwargs(ProcessingKwargs, total=False):
+    _defaults = {
+        "text_kwargs": {
+            "padding": False,
+        },
+        "images_kwargs": {},
+    }
+
+
 class LlavaProcessor(ProcessorMixin):
     r"""
     Constructs a Llava processor which wraps a Llava image processor and a Llava tokenizer into a single processor.
@@ -73,12 +88,11 @@ def __init__(
 
     def __call__(
         self,
-        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
         images: ImageInput = None,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy] = None,
-        max_length=None,
-        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
+        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+        audio=None,
+        videos=None,
+        **kwargs: Unpack[LlavaProcessorKwargs],
     ) -> BatchFeature:
         """
         Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
@@ -88,29 +102,15 @@ def __call__(
         of the above two methods for more information.
 
         Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
             text (`str`, `List[str]`, `List[List[str]]`):
                 The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                 (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                 `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
-            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
-                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
-                tensor. Both channels-first and channels-last formats are supported.
-            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
-                Select a strategy to pad the returned sequences (according to the model's padding side and padding
-                index) among:
-                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
-                  sequence if provided).
-                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
-                  acceptable input length for the model if that argument is not provided.
-                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
-                  lengths).
-            max_length (`int`, *optional*):
-                Maximum length of the returned list and optionally padding length (see above).
-            truncation (`bool`, *optional*):
-                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
             return_tensors (`str` or [`~utils.TensorType`], *optional*):
                 If set, will return tensors of a particular framework. Acceptable values are:
-
                 - `'tf'`: Return TensorFlow `tf.constant` objects.
                 - `'pt'`: Return PyTorch `torch.Tensor` objects.
                 - `'np'`: Return NumPy `np.ndarray` objects.
@@ -125,8 +125,19 @@ def __call__(
               `None`).
             - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
         """
+        if images is None and text is None:
+            raise ValueError("You have to specify at least one of `images` or `text`.")
+
+        # check if images and text inputs are reversed for BC
+        images, text = _validate_images_text_input_order(images, text)
+
+        output_kwargs = self._merge_kwargs(
+            LlavaProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
         if images is not None:
-            image_inputs = self.image_processor(images, return_tensors=return_tensors)
+            image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
         else:
             image_inputs = {}
 
@@ -158,13 +169,7 @@ def __call__(
                     "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
                 )
 
-        text_inputs = self.tokenizer(
-            prompt_strings,
-            return_tensors=return_tensors,
-            padding=padding,
-            truncation=truncation,
-            max_length=max_length,
-        )
+        text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
         return BatchFeature(data={**text_inputs, **image_inputs})
 
     # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py
index 5c05480ffa6d..305fc9e9a84c 100644
--- a/tests/models/llava/test_modeling_llava.py
+++ b/tests/models/llava/test_modeling_llava.py
@@ -274,7 +274,7 @@ def test_small_model_integration_test(self):
         prompt = "<image>\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT:"
         image_file = "https://llava-vl.github.io/static/images/view.jpg"
         raw_image = Image.open(requests.get(image_file, stream=True).raw)
-        inputs = self.processor(prompt, raw_image, return_tensors="pt")
+        inputs = self.processor(images=raw_image, text=prompt, return_tensors="pt")
 
         EXPECTED_INPUT_IDS = torch.tensor([[1, 32000, 28705, 13, 11123, 28747, 1824, 460, 272, 1722,315, 1023, 347, 13831, 925, 684, 739, 315, 3251, 456,1633, 28804, 13, 4816, 8048, 12738, 28747]])  # fmt: skip
         self.assertTrue(torch.equal(inputs["input_ids"], EXPECTED_INPUT_IDS))
@@ -299,7 +299,7 @@ def test_small_model_integration_test_llama_single(self):
         prompt = "USER: <image>\nWhat are the things I should be cautious about when I visit this place? ASSISTANT:"
         image_file = "https://llava-vl.github.io/static/images/view.jpg"
         raw_image = Image.open(requests.get(image_file, stream=True).raw)
-        inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
+        inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
 
         output = model.generate(**inputs, max_new_tokens=900, do_sample=False)
         EXPECTED_DECODED_TEXT = "USER:  \nWhat are the things I should be cautious about when I visit this place? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, there are a few things to be cautious about. First, be aware of the weather conditions, as sudden changes in weather can make the pier unsafe to walk on. Second, be mindful of the water depth and any potential hazards, such as submerged rocks or debris, that could cause accidents or injuries. Additionally, be cautious of the tides and currents, as they can change rapidly and pose a risk to swimmers or those who venture too close to the edge of the pier. Finally, be respectful of the environment and other visitors, and follow any posted rules or guidelines for the area."  # fmt: skip
@@ -325,7 +325,7 @@ def test_small_model_integration_test_llama_batched(self):
         image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw)
         image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
 
-        inputs = processor(prompts, images=[image1, image2], return_tensors="pt", padding=True)
+        inputs = processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True)
 
         output = model.generate(**inputs, max_new_tokens=20)
 
@@ -349,7 +349,7 @@ def test_small_model_integration_test_batch(self):
         image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw)
         image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
 
-        inputs = self.processor(prompts, images=[image1, image2], return_tensors="pt", padding=True)
+        inputs = self.processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True)
 
         output = model.generate(**inputs, max_new_tokens=20)
 
@@ -381,7 +381,7 @@ def test_small_model_integration_test_llama_batched_regression(self):
         image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw)
         image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
 
-        inputs = processor(prompts, images=[image1, image2, image1], return_tensors="pt", padding=True)
+        inputs = processor(images=[image1, image2, image1], text=prompts, return_tensors="pt", padding=True)
 
         output = model.generate(**inputs, max_new_tokens=20)
 
@@ -409,8 +409,8 @@ def test_batched_generation(self):
         image2 = Image.open(requests.get(url2, stream=True).raw)
 
         inputs = processor(
-            text=[prompt1, prompt2, prompt3],
             images=[image1, image2, image1, image2],
+            text=[prompt1, prompt2, prompt3],
             return_tensors="pt",
             padding=True,
         ).to(torch_device)
@@ -444,7 +444,7 @@ def test_llava_index_error_bug(self):
         image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
 
         raw_image = Image.open(requests.get(image_file, stream=True).raw)
-        inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
+        inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
 
         # Make sure that `generate` works
         _ = model.generate(**inputs, max_new_tokens=20)
@@ -510,7 +510,7 @@ def test_generation_no_images(self):
         processor = AutoProcessor.from_pretrained(model_id)
 
         # Prepare inputs with no images
-        inputs = processor("Hello, I am", return_tensors="pt").to(torch_device)
+        inputs = processor(text="Hello, I am", return_tensors="pt").to(torch_device)
 
         # Make sure that `generate` works
         _ = model.generate(**inputs, max_new_tokens=20)
@@ -554,13 +554,13 @@ def test_expansion_in_processing(self):
         # check processing with expansion of inputs
         processor.vision_feature_select_strategy = "default"
         processor.patch_size = 14
-        inputs_expanded = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
+        inputs_expanded = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
         self.assertTrue(inputs_expanded.input_ids.shape[-1] == 593)
 
         # check processing without expansion of inputs (legacy behavior)
         processor.vision_feature_select_strategy = None
         processor.patch_size = None
-        inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
+        inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
         self.assertTrue(inputs.input_ids.shape[-1] == 18)
 
         # generate exactly 20 tokens
diff --git a/tests/models/llava/test_processor_llava.py b/tests/models/llava/test_processor_llava.py
index 54c1b4674cbc..5b05a8b92ea5 100644
--- a/tests/models/llava/test_processor_llava.py
+++ b/tests/models/llava/test_processor_llava.py
@@ -11,18 +11,43 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import shutil
+import tempfile
 import unittest
 
-from transformers.testing_utils import require_vision
+from transformers import AutoProcessor, AutoTokenizer, LlamaTokenizerFast, LlavaProcessor
+from transformers.testing_utils import require_torch, require_vision
 from transformers.utils import is_vision_available
 
+from ...test_processing_common import ProcessorTesterMixin
+
 
 if is_vision_available():
-    from transformers import AutoTokenizer, LlavaProcessor
+    from transformers import CLIPImageProcessor
 
 
 @require_vision
-class LlavaProcessorTest(unittest.TestCase):
+class LlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
+    processor_class = LlavaProcessor
+
+    def setUp(self):
+        self.tmpdirname = tempfile.mkdtemp()
+        image_processor = CLIPImageProcessor(do_center_crop=False)
+        tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b")
+
+        processor = LlavaProcessor(image_processor=image_processor, tokenizer=tokenizer)
+
+        processor.save_pretrained(self.tmpdirname)
+
+    def get_tokenizer(self, **kwargs):
+        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
+
+    def get_image_processor(self, **kwargs):
+        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
+
+    def tearDown(self):
+        shutil.rmtree(self.tmpdirname)
+
     def test_can_load_various_tokenizers(self):
         for checkpoint in ["Intel/llava-gemma-2b", "llava-hf/llava-1.5-7b-hf"]:
             processor = LlavaProcessor.from_pretrained(checkpoint)
@@ -45,3 +70,29 @@ def test_chat_template(self):
 
         formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
         self.assertEqual(expected_prompt, formatted_prompt)
+
+    @require_torch
+    @require_vision
+    def test_unstructured_kwargs_batched(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = ["lower newer", "upper older longer string"]
+        image_input = self.prepare_image_inputs() * 2
+        inputs = processor(
+            images=image_input,
+            text=input_str,
+            return_tensors="pt",
+            size={"height": 214, "width": 214},
+            padding="longest",
+            max_length=76,
+        )
+
+        self.assertEqual(inputs["pixel_values"].shape[2], 214)
+
+        self.assertEqual(len(inputs["input_ids"][0]), 5)

From c7a91f5adf976e0517c4a7f1506fb0c24f353053 Mon Sep 17 00:00:00 2001
From: Sergio Paniego Blanco <sergiopaniegoblanco@gmail.com>
Date: Mon, 16 Sep 2024 18:52:27 +0200
Subject: [PATCH 13/67] `Agents, supercharged - Multi-agents, External tools,
 and more` docs typo fixed (#33478)

* Typo fixed in Agents, supercharged
---
 docs/source/en/agents_advanced.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/en/agents_advanced.md b/docs/source/en/agents_advanced.md
index e7469a310c41..399eeb9b70eb 100644
--- a/docs/source/en/agents_advanced.md
+++ b/docs/source/en/agents_advanced.md
@@ -34,7 +34,7 @@ You can easily build hierarchical multi-agent systems with `transformers.agents`
 
 To do so, encapsulate the agent in a [`ManagedAgent`] object. This object needs arguments `agent`, `name`, and a `description`, which will then be embedded in the manager agent's system prompt to let it know how to call this managed agent, as we also do for tools.
 
-Here's an example of making an agent that managed a specitif web search agent using our [`DuckDuckGoSearchTool`]:
+Here's an example of making an agent that managed a specific web search agent using our [`DuckDuckGoSearchTool`]:
 
 ```py
 from transformers.agents import ReactCodeAgent, HfApiEngine, DuckDuckGoSearchTool, ManagedAgent

From c2d05897bf4e8b34773838accaddd66028bc148d Mon Sep 17 00:00:00 2001
From: Ahmed Almaghz <53489256+AhmedAlmaghz@users.noreply.github.com>
Date: Mon, 16 Sep 2024 20:02:03 +0300
Subject: [PATCH 14/67] [i18n-ar] Add File : `docs/source/ar/_toctree.yml` 
 (#32696)

* Update ar lang build_documentation.yml

* Update ar lang build_pr_documentation.yml

* Update docs/source/ar/pipeline_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/pipeline_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/pipeline_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/pipeline_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/pipeline_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/pipeline_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/pipeline_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/pipeline_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/pipeline_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/pipeline_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/pipeline_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/pipeline_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/pipeline_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/pipeline_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/pipeline_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/pipeline_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/pipeline_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/pipeline_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/pipeline_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/pipeline_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/pipeline_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/pipeline_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/pipeline_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/pipeline_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/pipeline_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/autoclass_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/autoclass_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/autoclass_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/autoclass_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/autoclass_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/autoclass_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/autoclass_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/autoclass_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/autoclass_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/autoclass_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/autoclass_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/autoclass_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/autoclass_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/autoclass_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/preprocessing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/training.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/training.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/training.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/training.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/training.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/training.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/training.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/training.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/training.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/training.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/training.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/training.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/training.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/training.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/training.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/training.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/training.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/training.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/training.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/training.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/run_scripts.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/run_scripts.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/run_scripts.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/run_scripts.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/run_scripts.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/run_scripts.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/run_scripts.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/accelerate.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/accelerate.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/accelerate.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/accelerate.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/accelerate.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/accelerate.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Create _config.py

* Update _toctree.yml

* Update _toctree.yml

* Update docs/source/ar/peft.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/peft.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/peft.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/peft.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/peft.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/peft.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/peft.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/peft.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/peft.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update _toctree.yml

* Update docs/source/ar/model_sharing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/model_sharing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/model_sharing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/model_sharing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/model_sharing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/model_sharing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/model_sharing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/model_sharing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/model_sharing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/model_sharing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/model_sharing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/model_sharing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/model_sharing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/model_sharing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/model_sharing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/model_sharing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/model_sharing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/model_sharing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/model_sharing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/model_sharing.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/conversations.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/conversations.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/conversations.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/conversations.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/conversations.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/conversations.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/conversations.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/conversations.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/conversations.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/conversations.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/conversations.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/conversations.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/conversations.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/conversations.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/conversations.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/agents.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/llm_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/llm_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/llm_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/llm_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/llm_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/llm_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/llm_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/llm_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/llm_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/llm_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/llm_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/llm_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/llm_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/llm_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/llm_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/llm_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/llm_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/llm_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/llm_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/llm_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update docs/source/ar/llm_tutorial.md

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>

* Update llm_tutorial.md

* Update _toctree.yml

* Update autoclass_tutorial.md

* Update autoclass_tutorial.md

* Update preprocessing.md

* Update glossary.md

* Update run_scripts.md

* Update run_scripts.md

* Update run_scripts.md

---------

Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com>
---
 .github/workflows/build_documentation.yml    |   2 +-
 .github/workflows/build_pr_documentation.yml |   2 +-
 docs/source/ar/_config.py                    |  14 +
 docs/source/ar/_toctree.yml                  | 892 +++++++++++++++++++
 docs/source/ar/accelerate.md                 | 120 +++
 docs/source/ar/agents.md                     | 539 +++++++++++
 docs/source/ar/autoclass_tutorial.md         | 167 ++++
 docs/source/ar/conversations.md              | 204 +++++
 docs/source/ar/glossary.md                   | 446 ++++++++++
 docs/source/ar/index.md                      | 342 +++++++
 docs/source/ar/installation.md               | 246 +++++
 docs/source/ar/llm_tutorial.md               | 248 ++++++
 docs/source/ar/model_sharing.md              | 223 +++++
 docs/source/ar/peft.md                       | 250 ++++++
 docs/source/ar/pipeline_tutorial.md          | 315 +++++++
 docs/source/ar/preprocessing.md              | 521 +++++++++++
 docs/source/ar/quicktour.md                  | 543 +++++++++++
 docs/source/ar/run_scripts.md                | 351 ++++++++
 docs/source/ar/training.md                   | 412 +++++++++
 19 files changed, 5835 insertions(+), 2 deletions(-)
 create mode 100644 docs/source/ar/_config.py
 create mode 100644 docs/source/ar/_toctree.yml
 create mode 100644 docs/source/ar/accelerate.md
 create mode 100644 docs/source/ar/agents.md
 create mode 100644 docs/source/ar/autoclass_tutorial.md
 create mode 100644 docs/source/ar/conversations.md
 create mode 100644 docs/source/ar/glossary.md
 create mode 100644 docs/source/ar/index.md
 create mode 100644 docs/source/ar/installation.md
 create mode 100644 docs/source/ar/llm_tutorial.md
 create mode 100644 docs/source/ar/model_sharing.md
 create mode 100644 docs/source/ar/peft.md
 create mode 100644 docs/source/ar/pipeline_tutorial.md
 create mode 100644 docs/source/ar/preprocessing.md
 create mode 100644 docs/source/ar/quicktour.md
 create mode 100644 docs/source/ar/run_scripts.md
 create mode 100644 docs/source/ar/training.md

diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml
index e3e3b5f2df37..b25567fb092a 100644
--- a/.github/workflows/build_documentation.yml
+++ b/.github/workflows/build_documentation.yml
@@ -15,7 +15,7 @@ jobs:
       commit_sha: ${{ github.sha }}
       package: transformers
       notebook_folder: transformers_doc
-      languages: de en es fr hi it ko pt tr zh ja te
+      languages: ar de en es fr hi it ko pt tr zh ja te
       custom_container: huggingface/transformers-doc-builder
     secrets:
       token: ${{ secrets.HUGGINGFACE_PUSH }}
diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml
index c8d073ea3468..f698f860b2f9 100644
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@@ -14,5 +14,5 @@ jobs:
       commit_sha: ${{ github.event.pull_request.head.sha }}
       pr_number: ${{ github.event.number }}
       package: transformers
-      languages: de en es fr hi it ko pt tr zh ja te
+      languages: ar de en es fr hi it ko pt tr zh ja te
       custom_container: huggingface/transformers-doc-builder
diff --git a/docs/source/ar/_config.py b/docs/source/ar/_config.py
new file mode 100644
index 000000000000..f49e4e473196
--- /dev/null
+++ b/docs/source/ar/_config.py
@@ -0,0 +1,14 @@
+# docstyle-ignore
+INSTALL_CONTENT = """
+# Transformers installation
+! pip install transformers datasets evaluate accelerate
+# To install from source instead of the last release, comment the command above and uncomment the following one.
+# ! pip install git+https://github.com/huggingface/transformers.git
+"""
+
+notebook_first_cells = [{"type": "code", "content": INSTALL_CONTENT}]
+black_avoid_patterns = {
+    "{processor_class}": "FakeProcessorClass",
+    "{model_class}": "FakeModelClass",
+    "{object_class}": "FakeObjectClass",
+}
diff --git a/docs/source/ar/_toctree.yml b/docs/source/ar/_toctree.yml
new file mode 100644
index 000000000000..39e0ae14e19c
--- /dev/null
+++ b/docs/source/ar/_toctree.yml
@@ -0,0 +1,892 @@
+- sections:
+  - local: index
+    title: 🤗 المحولات
+  - local: quicktour
+    title: جولة سريعة
+  - local: installation
+    title: التثبيت
+  title: البدء
+- sections:
+  - local: pipeline_tutorial
+    title: تشغيل الاستنتاج باستخدام خطوط الأنابيب
+  - local: autoclass_tutorial
+    title: كتابة تعليمات برمجية متكيفه باستخدام AutoClass
+  - local: preprocessing
+    title: معالجة البيانات مسبقًا
+  - local: training
+    title: ضبط نموذج مسبق التدريب
+  - local: run_scripts
+    title: التدريب باستخدام نص برمجي
+  - local: accelerate
+    title: إعداد تدريب موزع باستخدام 🤗 Accelerate
+  - local: peft
+    title: تحميل النماذج المخصصة وتدريبها باستخدام 🤗 PEFT
+  - local: model_sharing
+    title: مشاركة نموذجك
+  - local: agents
+    title: الوكلاء
+  - local: llm_tutorial
+    title: التوليد باستخدام LLMs
+  - local: conversations
+    title: الدردشة مع المحولات
+  title: البرامج التعليمية
+# - sections:
+#   - isExpanded: false
+#     sections:
+#     - local: tasks/sequence_classification
+#       title: تصنيف النصوص
+#     - local: tasks/token_classification
+#       title: تصنيف الرموز
+#     - local: tasks/question_answering
+#       title: الإجابة على الأسئلة
+#     - local: tasks/language_modeling
+#       title: نمذجة اللغة السببية
+#     - local: tasks/masked_language_modeling
+#       title: نمذجة اللغة المقنعة
+#     - local: tasks/translation
+#       title: الترجمة
+#     - local: tasks/summarization
+#       title: التلخيص
+#     - local: tasks/multiple_choice
+#       title: الاختيار المتعدد
+#     title: معالجة اللغات الطبيعية
+#   - isExpanded: false
+#     sections:
+#     - local: tasks/audio_classification
+#       title: تصنيف الصوت
+#     - local: tasks/asr
+#       title: التعرف التلقائي على الكلام
+#     title: الصوت
+#   - isExpanded: false
+#     sections:
+#     - local: tasks/image_classification
+#       title: تصنيف الصور
+#     - local: tasks/semantic_segmentation
+#       title: تجزئة الصور
+#     - local: tasks/video_classification
+#       title: تصنيف الفيديو
+#     - local: tasks/object_detection
+#       title: اكتشاف الأشياء
+#     - local: tasks/zero_shot_object_detection
+#       title: اكتشاف الأشياء بدون تدريب
+#     - local: tasks/zero_shot_image_classification
+#       title: تصنيف الصور بدون تدريب
+#     - local: tasks/monocular_depth_estimation
+#       title: تقدير العمق
+#     - local: tasks/image_to_image
+#       title: صورة إلى صورة
+#     - local: tasks/image_feature_extraction
+#       title: استخراج ميزات الصورة
+#     - local: tasks/mask_generation
+#       title: توليد القناع
+#     - local: tasks/knowledge_distillation_for_image_classification
+#       title: التقليل المعرفي للرؤية الحاسوبية
+#     title: الرؤية الحاسوبية
+#   - isExpanded: false
+#     sections:
+#     - local: tasks/image_captioning
+#       title: وصف الصور Image captioning
+#     - local: tasks/document_question_answering
+#       title: الإجابة على أسئلة المستندات
+#     - local: tasks/visual_question_answering
+#       title: الإجابة على الأسئلة المرئية
+#     - local: tasks/text-to-speech
+#       title: تحويل النص إلى كلام
+#     title: المتعددة الوسائط
+#   - isExpanded: false
+#     sections:
+#     - local: generation_strategies
+#       title: تخصيص استراتيجية التوليد
+#     - local: kv_cache
+#       title: أفضل الممارسات للتوليد باستخدام ذاكرة التخزين المؤقت
+#     title: التوليد
+#   - isExpanded: false
+#     sections:
+#     - local: tasks/idefics
+#       title: مهام الصور مع IDEFICS
+#     - local: tasks/prompting
+#       title: دليل إرشادي لمحفزات النماذج اللغوية الكبيرة
+#     title: الإرشاد
+#   title: أدلة المهام
+# - sections:
+#   - local: fast_tokenizers
+#     title: استخدم برامج التجزئة السريعة من 🤗 Tokenizers
+#   - local: multilingual
+#     title: تشغيل الاستنتاج باستخدام نماذج متعددة اللغات
+#   - local: create_a_model
+#     title: استخدام واجهات برمجة التطبيقات الخاصة بالنموذج
+#   - local: custom_models
+#     title: مشاركة نموذج مخصص
+#   - local: chat_templating
+#     title: قوالب لنماذج الدردشة
+#   - local: trainer
+#     title: المدرب
+#   - local: sagemaker
+#     title: تشغيل التدريب على Amazon SageMaker
+#   - local: serialization
+#     title: التصدير إلى ONNX
+#   - local: tflite
+#     title: التصدير إلى TFLite
+#   - local: torchscript
+#     title: التصدير إلى TorchScript
+#   - local: benchmarks
+#     title: المعايير
+#   - local: notebooks
+#     title: دفاتر الملاحظات مع الأمثلة
+#   - local: community
+#     title: موارد المجتمع
+#   - local: troubleshooting
+#     title: استكشاف الأخطاء وإصلاحها
+#   - local: gguf
+#     title: التوافق مع ملفات GGUF
+#   title: أدلة المطورين
+# - sections:
+#   - local: quantization/overview
+#     title: نظرة عامة
+#   - local: quantization/bitsandbytes
+#     title: bitsandbytes
+#   - local: quantization/gptq
+#     title: GPTQ
+#   - local: quantization/awq
+#     title: AWQ
+#   - local: quantization/aqlm
+#     title: AQLM
+#   - local: quantization/quanto
+#     title: Quanto
+#   - local: quantization/eetq
+#     title: EETQ
+#   - local: quantization/hqq
+#     title: HQQ
+#   - local: quantization/optimum
+#     title: Optimum
+#   - local: quantization/contribute
+#     title: المساهمة بطريقة جديدة للتكميم
+#   title: أساليب التكميم
+# - sections:
+#   - local: performance
+#     title: الأداء-نظرة عامة
+#   - local: llm_optims
+#     title: تحسين الاستدلال LLM
+#   - sections:
+#     - local: perf_train_gpu_one
+#       title: استخدام عدة وحدات معالجة رسوميات (GPUs) بشكل متوازٍ
+#     - local: perf_train_gpu_many
+#       title: وحدات معالجة الرسومات (GPU) متعددة والتوازي
+#     - local: fsdp
+#       title: Fully Sharded Data Parallel
+#     - local: deepspeed
+#       title: DeepSpeed
+#     - local: perf_train_cpu
+#       title: التدريب الفعال على وحدة المعالجة المركزية (CPU)
+#     - local: perf_train_cpu_many
+#       title: التدريب الموزع لوحدة المعالجة المركزية (CPU)
+#     - local: perf_train_tpu_tf
+#       title: التدريب على (TPU) باستخدام TensorFlow
+#     - local: perf_train_special
+#       title: تدريب PyTorch على Apple silicon
+#     - local: perf_hardware
+#       title: الأجهزة المخصصة للتدريب
+#     - local: hpo_train
+#       title: البحث عن المعاملات المثلى باستخدام واجهة برمجة تطبيقات المدرب
+#     title: تقنيات التدريب الفعال
+#   - sections:
+#     - local: perf_infer_cpu
+#       title: الإستدلال على وحدة المعالجة المركزية (CPU)
+#     - local: perf_infer_gpu_one
+#       title: الإستدلال على وحدة معالجة الرسومات (GPU)
+#     title: تحسين الاستدلال
+#   - local: big_models
+#     title: إنشاء نموذج كبير
+#   - local: debugging
+#     title: تصحيح الأخطاء البرمجية
+#   - local: tf_xla
+#     title: تكامل XLA لنماذج TensorFlow
+#   - local: perf_torch_compile
+#     title: تحسين الاستدلال باستخدام `torch.compile()`
+#   title: الأداء وقابلية التوسع
+# - sections:
+#   - local: contributing
+#     title: كيفية المساهمة في 🤗 المحولات؟
+#   - local: add_new_model
+#     title: كيفية إضافة نموذج إلى 🤗 المحولات؟
+#   - local: add_new_pipeline
+#     title: كيفية إضافة خط أنابيب إلى 🤗 المحولات؟
+#   - local: testing
+#     title: الاختبار
+#   - local: pr_checks
+#     title: التحقق من طلب السحب
+#   title: المساهمة
+- sections:
+  # - local: philosophy
+  #   title: الفلسفة
+  - local: glossary
+    title: (قاموس المصطلحات (قائمة الكلمات
+  # - local: task_summary
+  #   title: ما الذي يمكن أن تفعله 🤗 المحولات
+  # - local: tasks_explained
+  #   title: كيف تحل المحولات المهام
+  # - local: model_summary
+  #   title: عائلة نماذج المحول
+  # - local: tokenizer_summary
+  #   title: ملخص برنامج مقسم النصوص (tokenizers)
+  # - local: attention
+  #   title: الانتباه Attention
+  # - local: pad_truncation
+  #   title: الحشو والتقليم
+  # - local: bertology
+  #   title: BERTology
+  # - local: perplexity
+  #   title: حيرة النماذج ذات الطول الثابت
+  # - local: pipeline_webserver
+  #   title: خطوط الأنابيب للاستدلال على خادم الويب
+  # - local: model_memory_anatomy
+  #   title: تشريح تدريب النموذج
+  # - local: llm_tutorial_optimization
+  #   title: الاستفادة القصوى من LLMs
+  title: أطر مفاهيمية
+# - sections:
+#   - sections:
+#     - local: main_classes/agent
+#       title: الوكلاء والأدوات
+#     - local: model_doc/auto
+#       title: فئات يتم إنشاؤها ديناميكيًا
+#     - local: main_classes/backbones
+#       title: العمود الفقري
+#     - local: main_classes/callback
+#       title: عمليات الاسترجاع
+#     - local: main_classes/configuration
+#       title: التكوين
+#     - local: main_classes/data_collator
+#       title: مجمع البيانات
+#     - local: main_classes/keras_callbacks
+#       title: استدعاءات Keras
+#     - local: main_classes/logging
+#       title: التسجيل
+#     - local: main_classes/model
+#       title: النماذج
+#     - local: main_classes/text_generation
+#       title: توليد النصوص
+#     - local: main_classes/onnx
+#       title: ONNX
+#     - local: main_classes/optimizer_schedules
+#       title: التحسين
+#     - local: main_classes/output
+#       title: مخرجات النموذج
+#     - local: main_classes/pipelines
+#       title: خطوط الأنابيب
+#     - local: main_classes/processors
+#       title: المعالجات
+#     - local: main_classes/quantization
+#       title: التكميم
+#     - local: main_classes/tokenizer
+#       title: برنامج مقسم النصوص
+#     - local: main_classes/trainer
+#       title: المدرب
+#     - local: main_classes/deepspeed
+#       title: DeepSpeed
+#     - local: main_classes/feature_extractor
+#       title: مستخرج الميزات
+#     - local: main_classes/image_processor
+#       title: معالج الصور
+#     title: الفئات الرئيسية
+#   - sections:
+#     - isExpanded: false
+#       sections:
+#       - local: model_doc/albert
+#         title: ALBERT
+#       - local: model_doc/bart
+#         title: BART
+#       - local: model_doc/barthez
+#         title: BARThez
+#       - local: model_doc/bartpho
+#         title: BARTpho
+#       - local: model_doc/bert
+#         title: BERT
+#       - local: model_doc/bert-generation
+#         title: BertGeneration
+#       - local: model_doc/bert-japanese
+#         title: BertJapanese
+#       - local: model_doc/bertweet
+#         title: Bertweet
+#       - local: model_doc/big_bird
+#         title: BigBird
+#       - local: model_doc/bigbird_pegasus
+#         title: BigBirdPegasus
+#       - local: model_doc/biogpt
+#         title: BioGpt
+#       - local: model_doc/blenderbot
+#         title: Blenderbot
+#       - local: model_doc/blenderbot-small
+#         title: Blenderbot Small
+#       - local: model_doc/bloom
+#         title: BLOOM
+#       - local: model_doc/bort
+#         title: BORT
+#       - local: model_doc/byt5
+#         title: ByT5
+#       - local: model_doc/camembert
+#         title: CamemBERT
+#       - local: model_doc/canine
+#         title: CANINE
+#       - local: model_doc/codegen
+#         title: CodeGen
+#       - local: model_doc/code_llama
+#         title: CodeLlama
+#       - local: model_doc/cohere
+#         title: Cohere
+#       - local: model_doc/convbert
+#         title: ConvBERT
+#       - local: model_doc/cpm
+#         title: CPM
+#       - local: model_doc/cpmant
+#         title: CPMANT
+#       - local: model_doc/ctrl
+#         title: CTRL
+#       - local: model_doc/dbrx
+#         title: DBRX
+#       - local: model_doc/deberta
+#         title: DeBERTa
+#       - local: model_doc/deberta-v2
+#         title: DeBERTa-v2
+#       - local: model_doc/dialogpt
+#         title: DialoGPT
+#       - local: model_doc/distilbert
+#         title: DistilBERT
+#       - local: model_doc/dpr
+#         title: DPR
+#       - local: model_doc/electra
+#         title: ELECTRA
+#       - local: model_doc/encoder-decoder
+#         title: Encoder Decoder Models
+#       - local: model_doc/ernie
+#         title: ERNIE
+#       - local: model_doc/ernie_m
+#         title: ErnieM
+#       - local: model_doc/esm
+#         title: ESM
+#       - local: model_doc/falcon
+#         title: Falcon
+#       - local: model_doc/fastspeech2_conformer
+#         title: FastSpeech2Conformer
+#       - local: model_doc/flan-t5
+#         title: FLAN-T5
+#       - local: model_doc/flan-ul2
+#         title: FLAN-UL2
+#       - local: model_doc/flaubert
+#         title: FlauBERT
+#       - local: model_doc/fnet
+#         title: FNet
+#       - local: model_doc/fsmt
+#         title: FSMT
+#       - local: model_doc/funnel
+#         title: Funnel Transformer
+#       - local: model_doc/fuyu
+#         title: Fuyu
+#       - local: model_doc/gemma
+#         title: Gemma
+#       - local: model_doc/openai-gpt
+#         title: GPT
+#       - local: model_doc/gpt_neo
+#         title: GPT Neo
+#       - local: model_doc/gpt_neox
+#         title: GPT NeoX
+#       - local: model_doc/gpt_neox_japanese
+#         title: GPT NeoX Japanese
+#       - local: model_doc/gptj
+#         title: GPT-J
+#       - local: model_doc/gpt2
+#         title: GPT2
+#       - local: model_doc/gpt_bigcode
+#         title: GPTBigCode
+#       - local: model_doc/gptsan-japanese
+#         title: GPTSAN Japanese
+#       - local: model_doc/gpt-sw3
+#         title: GPTSw3
+#       - local: model_doc/herbert
+#         title: HerBERT
+#       - local: model_doc/ibert
+#         title: I-BERT
+#       - local: model_doc/jamba
+#         title: Jamba
+#       - local: model_doc/jetmoe
+#         title: JetMoe
+#       - local: model_doc/jukebox
+#         title: Jukebox
+#       - local: model_doc/led
+#         title: LED
+#       - local: model_doc/llama
+#         title: LLaMA
+#       - local: model_doc/llama2
+#         title: Llama2
+#       - local: model_doc/llama3
+#         title: Llama3
+#       - local: model_doc/longformer
+#         title: Longformer
+#       - local: model_doc/longt5
+#         title: LongT5
+#       - local: model_doc/luke
+#         title: LUKE
+#       - local: model_doc/m2m_100
+#         title: M2M100
+#       - local: model_doc/madlad-400
+#         title: MADLAD-400
+#       - local: model_doc/mamba
+#         title: Mamba
+#       - local: model_doc/marian
+#         title: MarianMT
+#       - local: model_doc/markuplm
+#         title: MarkupLM
+#       - local: model_doc/mbart
+#         title: MBart and MBart-50
+#       - local: model_doc/mega
+#         title: MEGA
+#       - local: model_doc/megatron-bert
+#         title: MegatronBERT
+#       - local: model_doc/megatron_gpt2
+#         title: MegatronGPT2
+#       - local: model_doc/mistral
+#         title: Mistral
+#       - local: model_doc/mixtral
+#         title: Mixtral
+#       - local: model_doc/mluke
+#         title: mLUKE
+#       - local: model_doc/mobilebert
+#         title: MobileBERT
+#       - local: model_doc/mpnet
+#         title: MPNet
+#       - local: model_doc/mpt
+#         title: MPT
+#       - local: model_doc/mra
+#         title: MRA
+#       - local: model_doc/mt5
+#         title: MT5
+#       - local: model_doc/mvp
+#         title: MVP
+#       - local: model_doc/nezha
+#         title: NEZHA
+#       - local: model_doc/nllb
+#         title: NLLB
+#       - local: model_doc/nllb-moe
+#         title: NLLB-MoE
+#       - local: model_doc/nystromformer
+#         title: Nyströmformer
+#       - local: model_doc/olmo
+#         title: OLMo
+#       - local: model_doc/open-llama
+#         title: Open-Llama
+#       - local: model_doc/opt
+#         title: OPT
+#       - local: model_doc/pegasus
+#         title: Pegasus
+#       - local: model_doc/pegasus_x
+#         title: PEGASUS-X
+#       - local: model_doc/persimmon
+#         title: Persimmon
+#       - local: model_doc/phi
+#         title: Phi
+#       - local: model_doc/phi3
+#         title: Phi-3
+#       - local: model_doc/phobert
+#         title: PhoBERT
+#       - local: model_doc/plbart
+#         title: PLBart
+#       - local: model_doc/prophetnet
+#         title: ProphetNet
+#       - local: model_doc/qdqbert
+#         title: QDQBert
+#       - local: model_doc/qwen2
+#         title: Qwen2
+#       - local: model_doc/qwen2_moe
+#         title: Qwen2MoE
+#       - local: model_doc/rag
+#         title: RAG
+#       - local: model_doc/realm
+#         title: REALM
+#       - local: model_doc/recurrent_gemma
+#         title: RecurrentGemma
+#       - local: model_doc/reformer
+#         title: Reformer
+#       - local: model_doc/rembert
+#         title: RemBERT
+#       - local: model_doc/retribert
+#         title: RetriBERT
+#       - local: model_doc/roberta
+#         title: RoBERTa
+#       - local: model_doc/roberta-prelayernorm
+#         title: RoBERTa-PreLayerNorm
+#       - local: model_doc/roc_bert
+#         title: RoCBert
+#       - local: model_doc/roformer
+#         title: RoFormer
+#       - local: model_doc/rwkv
+#         title: RWKV
+#       - local: model_doc/splinter
+#         title: Splinter
+#       - local: model_doc/squeezebert
+#         title: SqueezeBERT
+#       - local: model_doc/stablelm
+#         title: StableLm
+#       - local: model_doc/starcoder2
+#         title: Starcoder2
+#       - local: model_doc/switch_transformers
+#         title: SwitchTransformers
+#       - local: model_doc/t5
+#         title: T5
+#       - local: model_doc/t5v1.1
+#         title: T5v1.1
+#       - local: model_doc/tapex
+#         title: TAPEX
+#       - local: model_doc/transfo-xl
+#         title: Transformer XL
+#       - local: model_doc/ul2
+#         title: UL2
+#       - local: model_doc/umt5
+#         title: UMT5
+#       - local: model_doc/xmod
+#         title: X-MOD
+#       - local: model_doc/xglm
+#         title: XGLM
+#       - local: model_doc/xlm
+#         title: XLM
+#       - local: model_doc/xlm-prophetnet
+#         title: XLM-ProphetNet
+#       - local: model_doc/xlm-roberta
+#         title: XLM-RoBERTa
+#       - local: model_doc/xlm-roberta-xl
+#         title: XLM-RoBERTa-XL
+#       - local: model_doc/xlm-v
+#         title: XLM-V
+#       - local: model_doc/xlnet
+#         title: XLNet
+#       - local: model_doc/yoso
+#         title: YOSO
+#       title: Text models
+#     - isExpanded: false
+#       sections:
+#       - local: model_doc/beit
+#         title: BEiT
+#       - local: model_doc/bit
+#         title: BiT
+#       - local: model_doc/conditional_detr
+#         title: Conditional DETR
+#       - local: model_doc/convnext
+#         title: ConvNeXT
+#       - local: model_doc/convnextv2
+#         title: ConvNeXTV2
+#       - local: model_doc/cvt
+#         title: CVT
+#       - local: model_doc/deformable_detr
+#         title: Deformable DETR
+#       - local: model_doc/deit
+#         title: DeiT
+#       - local: model_doc/depth_anything
+#         title: Depth Anything
+#       - local: model_doc/deta
+#         title: DETA
+#       - local: model_doc/detr
+#         title: DETR
+#       - local: model_doc/dinat
+#         title: DiNAT
+#       - local: model_doc/dinov2
+#         title: DINOV2
+#       - local: model_doc/dit
+#         title: DiT
+#       - local: model_doc/dpt
+#         title: DPT
+#       - local: model_doc/efficientformer
+#         title: EfficientFormer
+#       - local: model_doc/efficientnet
+#         title: EfficientNet
+#       - local: model_doc/focalnet
+#         title: FocalNet
+#       - local: model_doc/glpn
+#         title: GLPN
+#       - local: model_doc/imagegpt
+#         title: ImageGPT
+#       - local: model_doc/levit
+#         title: LeViT
+#       - local: model_doc/mask2former
+#         title: Mask2Former
+#       - local: model_doc/maskformer
+#         title: MaskFormer
+#       - local: model_doc/mobilenet_v1
+#         title: MobileNetV1
+#       - local: model_doc/mobilenet_v2
+#         title: MobileNetV2
+#       - local: model_doc/mobilevit
+#         title: MobileViT
+#       - local: model_doc/mobilevitv2
+#         title: MobileViTV2
+#       - local: model_doc/nat
+#         title: NAT
+#       - local: model_doc/poolformer
+#         title: PoolFormer
+#       - local: model_doc/pvt
+#         title: Pyramid Vision Transformer (PVT)
+#       - local: model_doc/pvt_v2
+#         title: Pyramid Vision Transformer v2 (PVTv2)
+#       - local: model_doc/regnet
+#         title: RegNet
+#       - local: model_doc/resnet
+#         title: ResNet
+#       - local: model_doc/segformer
+#         title: SegFormer
+#       - local: model_doc/seggpt
+#         title: SegGpt
+#       - local: model_doc/superpoint
+#         title: SuperPoint
+#       - local: model_doc/swiftformer
+#         title: SwiftFormer
+#       - local: model_doc/swin
+#         title: Swin Transformer
+#       - local: model_doc/swinv2
+#         title: Swin Transformer V2
+#       - local: model_doc/swin2sr
+#         title: Swin2SR
+#       - local: model_doc/table-transformer
+#         title: Table Transformer
+#       - local: model_doc/upernet
+#         title: UperNet
+#       - local: model_doc/van
+#         title: VAN
+#       - local: model_doc/vit
+#         title: Vision Transformer (ViT)
+#       - local: model_doc/vit_hybrid
+#         title: ViT Hybrid
+#       - local: model_doc/vitdet
+#         title: ViTDet
+#       - local: model_doc/vit_mae
+#         title: ViTMAE
+#       - local: model_doc/vitmatte
+#         title: ViTMatte
+#       - local: model_doc/vit_msn
+#         title: ViTMSN
+#       - local: model_doc/yolos
+#         title: YOLOS
+#       title: Vision models
+#     - isExpanded: false
+#       sections:
+#       - local: model_doc/audio-spectrogram-transformer
+#         title: Audio Spectrogram Transformer
+#       - local: model_doc/bark
+#         title: Bark
+#       - local: model_doc/clap
+#         title: CLAP
+#       - local: model_doc/encodec
+#         title: EnCodec
+#       - local: model_doc/hubert
+#         title: Hubert
+#       - local: model_doc/mctct
+#         title: MCTCT
+#       - local: model_doc/mms
+#         title: MMS
+#       - local: model_doc/musicgen
+#         title: MusicGen
+#       - local: model_doc/musicgen_melody
+#         title: MusicGen Melody
+#       - local: model_doc/pop2piano
+#         title: Pop2Piano
+#       - local: model_doc/seamless_m4t
+#         title: Seamless-M4T
+#       - local: model_doc/seamless_m4t_v2
+#         title: SeamlessM4T-v2
+#       - local: model_doc/sew
+#         title: SEW
+#       - local: model_doc/sew-d
+#         title: SEW-D
+#       - local: model_doc/speech_to_text
+#         title: Speech2Text
+#       - local: model_doc/speech_to_text_2
+#         title: Speech2Text2
+#       - local: model_doc/speecht5
+#         title: SpeechT5
+#       - local: model_doc/unispeech
+#         title: UniSpeech
+#       - local: model_doc/unispeech-sat
+#         title: UniSpeech-SAT
+#       - local: model_doc/univnet
+#         title: UnivNet
+#       - local: model_doc/vits
+#         title: VITS
+#       - local: model_doc/wav2vec2
+#         title: Wav2Vec2
+#       - local: model_doc/wav2vec2-bert
+#         title: Wav2Vec2-BERT
+#       - local: model_doc/wav2vec2-conformer
+#         title: Wav2Vec2-Conformer
+#       - local: model_doc/wav2vec2_phoneme
+#         title: Wav2Vec2Phoneme
+#       - local: model_doc/wavlm
+#         title: WavLM
+#       - local: model_doc/whisper
+#         title: Whisper
+#       - local: model_doc/xls_r
+#         title: XLS-R
+#       - local: model_doc/xlsr_wav2vec2
+#         title: XLSR-Wav2Vec2
+#       title: Audio models
+#     - isExpanded: false
+#       sections:
+#       - local: model_doc/timesformer
+#         title: TimeSformer
+#       - local: model_doc/videomae
+#         title: VideoMAE
+#       - local: model_doc/vivit
+#         title: ViViT
+#       title: Video models
+#     - isExpanded: false
+#       sections:
+#       - local: model_doc/align
+#         title: ALIGN
+#       - local: model_doc/altclip
+#         title: AltCLIP
+#       - local: model_doc/blip
+#         title: BLIP
+#       - local: model_doc/blip-2
+#         title: BLIP-2
+#       - local: model_doc/bridgetower
+#         title: BridgeTower
+#       - local: model_doc/bros
+#         title: BROS
+#       - local: model_doc/chinese_clip
+#         title: Chinese-CLIP
+#       - local: model_doc/clip
+#         title: CLIP
+#       - local: model_doc/clipseg
+#         title: CLIPSeg
+#       - local: model_doc/clvp
+#         title: CLVP
+#       - local: model_doc/data2vec
+#         title: Data2Vec
+#       - local: model_doc/deplot
+#         title: DePlot
+#       - local: model_doc/donut
+#         title: Donut
+#       - local: model_doc/flava
+#         title: FLAVA
+#       - local: model_doc/git
+#         title: GIT
+#       - local: model_doc/grounding-dino
+#         title: Grounding DINO
+#       - local: model_doc/groupvit
+#         title: GroupViT
+#       - local: model_doc/idefics
+#         title: IDEFICS
+#       - local: model_doc/idefics2
+#         title: Idefics2
+#       - local: model_doc/instructblip
+#         title: InstructBLIP
+#       - local: model_doc/kosmos-2
+#         title: KOSMOS-2
+#       - local: model_doc/layoutlm
+#         title: LayoutLM
+#       - local: model_doc/layoutlmv2
+#         title: LayoutLMV2
+#       - local: model_doc/layoutlmv3
+#         title: LayoutLMV3
+#       - local: model_doc/layoutxlm
+#         title: LayoutXLM
+#       - local: model_doc/lilt
+#         title: LiLT
+#       - local: model_doc/llava
+#         title: Llava
+#       - local: model_doc/llava_next
+#         title: LLaVA-NeXT
+#       - local: model_doc/lxmert
+#         title: LXMERT
+#       - local: model_doc/matcha
+#         title: MatCha
+#       - local: model_doc/mgp-str
+#         title: MGP-STR
+#       - local: model_doc/nougat
+#         title: Nougat
+#       - local: model_doc/oneformer
+#         title: OneFormer
+#       - local: model_doc/owlvit
+#         title: OWL-ViT
+#       - local: model_doc/owlv2
+#         title: OWLv2
+#       - local: model_doc/paligemma
+#         title: PaliGemma
+#       - local: model_doc/perceiver
+#         title: Perceiver
+#       - local: model_doc/pix2struct
+#         title: Pix2Struct
+#       - local: model_doc/sam
+#         title: Segment Anything
+#       - local: model_doc/siglip
+#         title: SigLIP
+#       - local: model_doc/speech-encoder-decoder
+#         title: Speech Encoder Decoder Models
+#       - local: model_doc/tapas
+#         title: TAPAS
+#       - local: model_doc/trocr
+#         title: TrOCR
+#       - local: model_doc/tvlt
+#         title: TVLT
+#       - local: model_doc/tvp
+#         title: TVP
+#       - local: model_doc/udop
+#         title: UDOP
+#       - local: model_doc/video_llava
+#         title: VideoLlava
+#       - local: model_doc/vilt
+#         title: ViLT
+#       - local: model_doc/vipllava
+#         title: VipLlava
+#       - local: model_doc/vision-encoder-decoder
+#         title: Vision Encoder Decoder Models
+#       - local: model_doc/vision-text-dual-encoder
+#         title: Vision Text Dual Encoder
+#       - local: model_doc/visual_bert
+#         title: VisualBERT
+#       - local: model_doc/xclip
+#         title: X-CLIP
+#       title: Multimodal models
+#     - isExpanded: false
+#       sections:
+#       - local: model_doc/decision_transformer
+#         title: محول القرار
+#       - local: model_doc/trajectory_transformer
+#         title: محول المسار
+#       title: نماذج التعلم التعزيزية
+#     - isExpanded: false
+#       sections:
+#       - local: model_doc/autoformer
+#         title: Autoformer
+#       - local: model_doc/informer
+#         title: Informer
+#       - local: model_doc/patchtsmixer
+#         title: PatchTSMixer
+#       - local: model_doc/patchtst
+#         title: PatchTST
+#       - local: model_doc/time_series_transformer
+#         title: محول السلاسل الزمنية
+#       title: نماذج السلاسل الزمنية
+#     - isExpanded: false
+#       sections:
+#       - local: model_doc/graphormer
+#         title: Graphormer
+#       title: نماذج الرسم البياني
+#     title: النماذج
+#   - sections:
+#     - local: internal/modeling_utils
+#       title: الطبقات المخصصة والمرافق
+#     - local: internal/pipelines_utils
+#       title: مرافق خطوط الأنابيب
+#     - local: internal/tokenization_utils
+#       title: مرافق مقسم النصوص 
+#     - local: internal/trainer_utils
+#       title: مرافق المدرب
+#     - local: internal/generation_utils
+#       title: مرافق التوليد
+#     - local: internal/image_processing_utils
+#       title: مرافق معالجة الصور
+#     - local: internal/audio_utils
+#       title: مرافق معالجة الصوت
+#     - local: internal/file_utils
+#       title: مرافق عامة
+#     - local: internal/time_series_utils
+#       title: مرافق السلاسل الزمنية
+#     title: مساعدون داخليون
+#   title: API
diff --git a/docs/source/ar/accelerate.md b/docs/source/ar/accelerate.md
new file mode 100644
index 000000000000..486c1efe59af
--- /dev/null
+++ b/docs/source/ar/accelerate.md
@@ -0,0 +1,120 @@
+# التدريب الموزع باستخدام  🤗 Accelerate
+
+
+مع تزايد حجم النماذج اللغوية، برز التوازي كأحد الاستراتيجيات لتدريب نماذج أكبر على أجهزة محدودة وتسريع عملية التدريب بمقدار كبير.  أنشأنا في Hugging Face، قمنا بإنشاء مكتبة [ Accelerate](https://huggingface.co/docs/accelerate) لمساعدة المستخدمين على تدريب أي نموذج من  Transformers بسهولة على أي نوع من الإعدادات الموزعة، سواء كان ذلك على عدة وحدات معالجة رسومات (GPUs) على جهاز واحد أو على عدة وحدات معالجة رسومات موزعة على عدة أجهزة. في هذا الدليل، تعلم كيفية تخصيص حلقة تدريب PyTorch الأصلية لتمكين التدريب في بيئة موزعة.
+
+## الإعداد
+
+ابدأ بتثبيت 🤗 Accelerate:
+
+```bash
+pip install accelerate
+```
+
+ثم قم باستيراد وإنشاء كائن [`~accelerate.Accelerator`]. سيقوم [`~accelerate.Accelerator`] تلقائيًا باكتشاف نوع الإعداد الموزع الخاص بك وتهيئة جميع المكونات اللازمة للتدريب. لن تحتاج إلى وضع نموذجك على جهاز بشكل معين.
+
+```py
+>>> from accelerate import Accelerator
+
+>>> accelerator = Accelerator()
+```
+
+## الاستعداد للتسريع
+
+الخطوة التالية هي تمرير جميع كائنات التدريب ذات الصلة إلى دالة الإعداد [`~accelerate.Accelerator.prepare`]. ويشمل ذلك DataLoaders للتدريب والتقييم، ونموذجًا ومُحَسِّنً المعاملات (optimizer):
+
+```py
+>>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
+...     train_dataloader, eval_dataloader, model, optimizer
+... )
+```
+
+## الخلفي Backward
+
+الإضافة الأخيرة هي استبدال الدالة المعتادة `loss.backward()` في حلقة التدريب الخاصة بك بدالة [`~accelerate.Accelerator.backward`] في 🤗 Accelerate:
+
+```py
+>>> for epoch in range(num_epochs):
+...     for batch in train_dataloader:
+...         outputs = model(**batch)
+...         loss = outputs.loss
+...         accelerator.backward(loss)
+
+...         optimizer.step()
+...         lr_scheduler.step()
+...         optimizer.zero_grad()
+...         progress_bar.update(1)
+```
+
+كما يمكنك أن ترى في الكود التالي، فأنت بحاجة فقط إلى إضافة أربعة أسطر من الكود إلى حلقة التدريب الخاصة بك لتمكين التدريب الموزع!
+
+```diff
++ from accelerate import Accelerator
+  from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler
+
++ accelerator = Accelerator()
+
+  model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
+  optimizer = AdamW(model.parameters(), lr=3e-5)
+
+- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+- model.to(device)
+
++ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
++     train_dataloader, eval_dataloader, model, optimizer
++ )
+
+  num_epochs = 3
+  num_training_steps = num_epochs * len(train_dataloader)
+  lr_scheduler = get_scheduler(
+      "linear",
+      optimizer=optimizer,
+      num_warmup_steps=0,
+      num_training_steps=num_training_steps
+  )
+
+  progress_bar = tqdm(range(num_training_steps))
+
+  model.train()
+  for epoch in range(num_epochs):
+      for batch in train_dataloader:
+-         batch = {k: v.to(device) for k, v in batch.items()}
+          outputs = model(**batch)
+          loss = outputs.loss
+-         loss.backward()
++         accelerator.backward(loss)
+optimizer.step()
+          lr_scheduler.step()
+          optimizer.zero_grad()
+          progress_bar.update(1)
+```
+
+## تدريب
+
+بمجرد إضافة أسطر الكود ذات الصلة، قم بتشغيل التدريب الخاص بك في أحد النصوص أو الدفاتر مثل Colaboratory.
+
+### التدريب باستخدام نص برمجي
+
+إذا كنت تشغل التدريب الخاص بك من نص برمجي، فقم بتشغيل الأمر التالي لإنشاء وحفظ ملف تكوين:
+
+```bash
+accelerate config
+```
+
+ثم قم بتشغيل التدريب الخاص بك باستخدام:
+
+```bash
+accelerate launch train.py
+```
+
+### التدريب باستخدام دفتر ملاحظات
+
+يمكن أيضًا تشغيل 🤗 Accelerate في دفاتر إذا كنت تخطط لاستخدام وحدات معالجة الرسوميات (TPUs) في Colaboratory. قم بتغليف كل الكود المسؤول عن التدريب في دالة، ومررها إلى [`~accelerate.notebook_launcher`]:
+
+```py
+>>> from accelerate import notebook_launcher
+
+>>> notebook_launcher(training_function)
+```
+
+للحصول على مزيد من المعلومات حول 🤗 Accelerate وميزاته الغنية، يرجى الرجوع إلى [الوثائق](https://huggingface.co/docs/accelerate).
\ No newline at end of file
diff --git a/docs/source/ar/agents.md b/docs/source/ar/agents.md
new file mode 100644
index 000000000000..92b2a4715f6f
--- /dev/null
+++ b/docs/source/ar/agents.md
@@ -0,0 +1,539 @@
+# الوكلاء والأدوات
+
+[[open-in-colab]]
+
+### ما هو الوكيل؟
+
+يمكن للنظم اللغوية الكبيرة (LLMs) التي تم تدريبها على أداء [نمذجة اللغة السببية](./tasks/language_modeling.) التعامل مع مجموعة واسعة من المهام، ولكنها غالبًا ما تواجه صعوبات في المهام الأساسية مثل المنطق والحساب والبحث. وعندما يتم استدعاؤها في مجالات لا تؤدي فيها أداءً جيدًا، فإنها غالبًا ما تفشل في توليد الإجابة التي نتوقعها منها.
+
+يتمثل أحد النهج للتغلب على هذا القصور في إنشاء "وكيل".
+
+الوكيل هو نظام يستخدم LLM كمحرك له، ولديه حق الوصول إلى وظائف تسمى "أدوات".
+
+هذه "الأدوات" هي وظائف لأداء مهمة، وتحتوي على جميع الأوصاف اللازمة للوكيل لاستخدامها بشكل صحيح.
+
+يمكن برمجة الوكيل للقيام بما يلي:
+- وضع سلسلة من الإجراءات/الأدوات وتشغيلها جميعًا في نفس الوقت مثل [`CodeAgent`] على سبيل المثال
+- التخطيط للاجراءات/الأدوات وتنفيذها واحدة تلو الأخرى والانتظار حتى انتهاء كل إجراء قبل إطلاق التالي مثل [`ReactJsonAgent`] على سبيل المثال
+
+### أنواع الوكلاء
+
+#### الوكيل البرمجي (Code agent)
+
+يتمتع هذا الوكيل يتبع خطوات محددة: أولًا، يخطط لسلسلة من الإجراءات التي يريد تنفيذها، ثم شفرة Python لتنفيذ جميع الإجراءات في نفس الوقت. وهو يتعامل بشكل أصلي مع أنواع مختلفة من المدخلات والمخرجات للأدوات التي يستخدمها، وبالتالي فهو الخيار الموصى به للمهام متعددة الوسائط.
+
+#### وكلاء التفاعل
+
+هذا هو الوكيل الذي يتم اللجوء إليه لحل مهام الاستدلال، حيث يجعل إطار ReAct ([Yao et al.، 2022](https://huggingface.co/papers/2210.03629)) من الكفاءة حقًا التفكير على أساس ملاحظاته السابقة.
+
+نقوم بتنفيذ إصدارين من ReactJsonAgent: 
+- [`ReactJsonAgent`] يقوم بتوليد استدعاءات الأدوات كـ JSON في إخراجها.
+- [`ReactCodeAgent`] هو نوع جديد من ReactJsonAgent يقوم بتوليد استدعاءات أدواته كمقاطع من التعليمات البرمجية، والتي تعمل بشكل جيد حقًا مع LLMs التي تتمتع بأداء  قوي في البرمجة.
+
+> [!TIP]
+> اقرأ منشور المدونة [Open-source LLMs as LangChain Agents](https://huggingface.co/blog/open-source-llms-as-agents) لمعرفة المزيد عن وكيل ReAct.
+
+![إطار عمل وكيل ReAct](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/open-source-llms-as-agents/ReAct.png)
+
+على سبيل المثال، إليك كيف يعمل وكيل ReAct Code طريقه من خلال السؤال التالي.
+
+```py3
+>>> agent.run(
+...     "How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?",
+... )
+=====New task=====
+How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?
+====Agent is executing the code below:
+bert_blocks = search(query="number of blocks in BERT base encoder")
+print("BERT blocks:", bert_blocks)
+====
+Print outputs:
+BERT blocks: twelve encoder blocks
+
+====Agent is executing the code below:
+attention_layer = search(query="number of layers in Attention is All You Need")
+print("Attention layers:", attention_layer)
+====
+Print outputs:
+Attention layers: Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position- 2 Page 3 Figure 1: The Transformer - model architecture.
+
+====Agent is executing the code below:
+bert_blocks = 12
+attention_layers = 6
+diff = bert_blocks - attention_layers
+print("Difference in blocks:", diff)
+final_answer(diff)
+====
+
+Print outputs:
+Difference in blocks: 6
+
+Final answer: 6
+```
+
+### كيف يمكنني بناء وكيل؟
+
+لتهيئة وكيل، تحتاج إلى هذه الوسائط:
+
+- نموذج لغوي كبير (LLM) يشكل المحرك الأساسي للوكيل. الوكيل نفسه ليس النموذج اللغوي، بل هو برنامج يستخدم النموذج اللغوي كمحرك له.
+- موجه النظام (system prompt): هذه هي التعليمات التي يتم إعطاؤها للنموذج اللغوي لإنشاء مخرجاته.
+- صندوق أدوات (toolbox) يختار الوكيل منه الأدوات لتنفيذها
+- محلل (parser) لاستخراج الأدوات التي يجب استدعاؤها من مخرجات النموذج اللغوي LLM والأدوات التي يجب استخدامها
+
+عند تهيئة نظام الوكيل، يتم استخدام سمات الأداة لإنشاء وصف للأداة، ثم يتم دمجها في موجه النظام الخاص `system_prompt` للوكيل لإعلامه بالأدوات التي يمكنه استخدامها ولماذا.
+
+للبدء، يرجى تثبيت `agents` الإضافية لتثبيت جميع التبعيات الافتراضية.
+
+```bash
+pip install transformers[agents]
+```
+
+قم ببناء محرك LLM الخاص بك من خلال تعريف طريقة `llm_engine` التي تقبل قائمة من [الرسائل](./chat_templating.) وتعيد النص. يجب أن تقبل هذه الدالة القابلة للاستدعاء أيضًا معامل `stop` يشير إلى متى يجب التوقف عن التوليد.
+
+```python
+from huggingface_hub import login, InferenceClient
+
+login("<YOUR_HUGGINGFACEHUB_API_TOKEN>")
+
+client = InferenceClient(model="meta-llama/Meta-Llama-3-70B-Instruct")
+
+def llm_engine(messages, stop_sequences=["Task"]) -> str:
+    response = client.chat_completion(messages, stop=stop_sequences, max_tokens=1000)
+    answer = response.choices[0].message.content
+    return answer
+```
+
+يمكنك استخدام أي طريقة `llm_engine` طالما أنها:
+1. يتبع تنسيق [رسائل](./chat_templating.md) لإدخاله (`List [Dict [str، str]]`) ويعيد `str`
+2. يتوقف عن توليد المخراجات من التسلسلات التي تم تمريرها في معامل `stop`
+
+أنت بحاجة أيضًا إلى معامل "الأدوات" الذي يقبل قائمة من "الأدوات". يمكنك توفير قائمة فارغة لـ "الأدوات"، ولكن استخدم صندوق الأدوات الافتراضي مع معامل اختياري `add_base_tools=True`.
+
+الآن يمكنك إنشاء وكيل، مثل [`CodeAgent`], وتشغيله. ولتسهيل الأمر، نقدم أيضًا فئة [`HfEngine`] التي تستخدم `huggingface_hub.InferenceClient` بشكل مخفى.
+
+```python
+from transformers import CodeAgent, HfEngine
+
+llm_engine = HfEngine(model="meta-llama/Meta-Llama-3-70B-Instruct")
+agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
+
+agent.run(
+    "Could you translate this sentence from French, say it out loud and return the audio.",
+    sentence="Où est la boulangerie la plus proche?",
+)
+```
+
+هذه الميزة ستكون مفيدة في حالة الحاجة الملحة! يمكنك حتى ترك معامل `llm_engine` غير محدد، وسيتم إنشاء [`HfEngine`] بشكل تلقائي.
+
+```python
+from transformers import CodeAgent
+
+agent = CodeAgent(tools=[], add_base_tools=True)
+
+agent.run(
+    "Could you translate this sentence from French, say it out loud and give me the audio.",
+    sentence="Où est la boulangerie la plus proche?",
+)
+```
+
+لاحظ أننا استخدمنا معامل "sentence" إضافي: يمكنك تمرير النص كمعامل إضافي إلى النموذج.
+
+يمكنك أيضًا استخدام هذا للإشارة إلى مسار الملفات المحلية أو البعيدة للنموذج لاستخدامها:
+
+```py
+from transformers import ReactCodeAgent
+
+agent = ReactCodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
+
+agent.run("Why does Mike not know many people in New York?", audio="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/recording.mp3")
+```
+
+
+تم تحديد موجه النظام ومحلل المخرجات تلقائيًا، ولكن يمكنك فحصهما بسهولة عن طريق استدعاء `system_prompt_template` على وكيلك.
+
+```python
+print(agent.system_prompt_template)
+```
+
+من المهم أن تشرح بأكبر قدر ممكن من الوضوح المهمة التي تريد تنفيذها.
+كل عملية [`~Agent.run`] مستقلة، وبما أن الوكيل مدعوم من LLM، فقد تؤدي الاختلافات الطفيفة في موجهك إلى نتائج مختلفة تمامًا.
+يمكنك أيضًا تشغيل وكيل بشكل متتالي لمهام مختلفة: في كل مرة يتم فيها إعادة تهيئة سمتي `agent.task` و`agent.logs`.
+
+
+#### تنفيذ التعليمات البرمجية
+
+يقوم مفسر Python بتنفيذ التعليمات البرمجية على مجموعة من المدخلات التي يتم تمريرها جنبًا إلى جنب مع أدواتك.
+يجب أن يكون هذا الأمر آمنًا لأن الوظائف الوحيدة التي يمكن استدعاؤها هي الأدوات التي قدمتها (خاصة إذا كانت أدوات من Hugging Face فقط) ووظيفة الطباعة، لذا فأنت مقيد بالفعل بما يمكن تنفيذه.
+
+مفسر Python لا يسمح أيضًا باستدعاء دوال بشكل افتراضي خارج قائمة آمنة، لذا فإن جميع الهجمات الأكثر وضوحًا لا ينبغي أن تكون مشكلة.
+يمكنك أيضًا الإذن باستيرادات إضافية عن طريق تمرير الوحدات النمطية المصرح بها كقائمة من السلاسل في معامل  `additional_authorized_imports` عند تهيئة [`ReactCodeAgent`] أو [`CodeAgent`]:
+
+```py
+>>> from transformers import ReactCodeAgent
+
+>>> agent = ReactCodeAgent(tools=[], additional_authorized_imports=['requests', 'bs4'])
+>>> agent.run("Could you get me the title of the page at url 'https://huggingface.co/blog'?")
+
+(...)
+'Hugging Face – Blog'
+```
+
+سيتم إيقاف التنفيذ عند أي رمز يحاول تنفيذ عملية غير قانونية أو إذا كان هناك خطأ Python عادي في التعليمات البرمجية التي تم إنشاؤها بواسطة الوكيل.
+
+> [!WARNING]
+> يمكن لـ LLM توليد شفرة برمجية عشوائية سيتم تنفيذها بعد ذلك: لا تقمب استدعاء أى دوال غير آمنة!
+
+### موجه النظام
+
+ينشئ الوكيل، أو بالأحرى LLM الذي يقود الوكيل، يولد مخرجات بناءً على موجه النظام. يمكن تخصيص موجه النظام وتصميمه للمهام المقصودة. على سبيل المثال، تحقق من موجه النظام لـ [`ReactCodeAgent`] (الإصدار أدناه مبسط قليلاً).
+
+```text
+You will be given a task to solve as best you can.
+You have access to the following tools:
+<<tool_descriptions>>
+
+To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.
+
+At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task, then the tools that you want to use.
+Then in the 'Code:' sequence, you shold write the code in simple Python. The code sequence must end with '/End code' sequence.
+During each intermediate step, you can use 'print()' to save whatever important information you will then need.
+These print outputs will then be available in the 'Observation:' field, for using this information as input for the next step.
+
+In the end you have to return a final answer using the `final_answer` tool.
+
+Here are a few examples using notional tools:
+---
+{examples}
+
+Above example were using notional tools that might not exist for you. You only have acces to those tools:
+<<tool_names>>
+You also can perform computations in the python code you generate.
+
+Always provide a 'Thought:' and a 'Code:\n```py' sequence ending with '```<end_code>' sequence. You MUST provide at least the 'Code:' sequence to move forward.
+
+Remember to not perform too many operations in a single code block! You should split the task into intermediate code blocks.
+Print results at the end of each step to save the intermediate results. Then use final_answer() to return the final result.
+
+Remember to make sure that variables you use are all defined.
+
+Now Begin!
+```
+
+يتضمن موجه النظام:
+- *مقدمة* تشرح كيف يجب أن يتصرف الوكيل والأدوات التي يجب عليه استخدامها.
+- وصف لجميع الأدوات التي يتم تحديدها بواسطة رمز `<<tool_descriptions>>` الذي يتم استبداله ديناميكيًا في وقت التشغيل بالأدوات التي يحددها المستخدم أو يختارها.
+    - يأتي وصف الأداة من سمات الأداة، `name`، و`description`، و`inputs` و`output_type`، وقالب `jinja2` بسيط يمكنك تحسينه.
+- شكل المخرج المتوقع.
+
+يمكنك تحسين موجه النظام، على سبيل المثال، عن طريق إضافة شرح لتنسيق المخرجات.
+
+للحصول على أقصى قدر من المرونة، يمكنك الكتابة فوق قالب موجه النظام بالكامل عن طريق تمرير موجه مخصص كمعامل إلى معلمة `system_prompt`.
+
+```python
+from transformers import ReactJsonAgent
+from transformers.agents import PythonInterpreterTool
+
+agent = ReactJsonAgent(tools=[PythonInterpreterTool()], system_prompt="{your_custom_prompt}")
+```
+
+> [!WARNING]
+> يرجى التأكد من تحديد سلسلة `<<tool_descriptions>>` في مكان ما في `template` حتى يكون الوكيل على علم 
+بالأدوات المتاحة.
+
+
+### فحص تشغيل الوكيل
+
+فيما يلي بعض السمات المفيدة لفحص ما حدث بعد التشغيل:
+- تخزن  `agent.logs` سجلات مفصلة للوكيل. في كل خطوة من تشغيل الوكيل، يتم تخزين كل شيء في قاموس إلحاقه بـ `agent.logs`.
+- تشغيل `agent.write_inner_memory_from_logs()` يخلق ذاكرة داخلية لسجلات الوكيل للنظام LLM لعرضها، كقائمة من رسائل الدردشة. تنتقل هذه الطريقة عبر كل خطوة من سجل الوكيل ولا تخزن سوى ما يهمها كرسالة: على سبيل المثال، سيحفظ موجه النظام والمهمة في رسائل منفصلة، ثم لكل خطوة سيخزن مخرج LLM كرسالة، ومخرج استدعاء الأداة كرسالة أخرى. استخدم هذا إذا كنت تريد عرضًا عامًا لما حدث - ولكن لن يتم نسخ كل سجل بواسطة هذه الطريقة.
+
+## الأدوات
+
+الأداة هي عبارة عن وظيفة أساسية يستخدمها الوكيل لتنفيذ مهمة محددة.
+
+يمكنك على سبيل المثال التحقق من [`PythonInterpreterTool`]: لديه اسم ووصف ووصف للمدخلات ونوع للمخرج، وطريقة `__call__` التي تقوم بتنفيذ المهمة المطلوبة.
+
+عند تهيئة الوكيل، يتم استخدام سمات الأداة لتوليد وصف للأداة يتم تضمينه في موجه النظام الخاص بالوكيل. يتيح هذا للوكيل معرفة الأدوات التي يمكنه استخدامها ولماذا.
+
+### صندوق الأدوات الافتراضي
+
+يأتي Transformers مع صندوق أدوات افتراضي لتمكين الوكلاء، والذي يمكنك إضافته إلى وكيلك عند التهيئة باستخدام معامل `add_base_tools = True`:
+
+- **الإجابة على أسئلة المستند**: الإجابة على سؤال حول المستند (مثل ملف PDF) بتنسيق صورة ([Donut](./model_doc/donut))
+- **الإجابة على أسئلة الصور**: الإجابة على سؤال حول صورة ([VILT](./model_doc/vilt))
+- **التحدث إلى النص**: قم بتفريغ الكلام إلى نص ([Whisper](./model_doc/whisper))
+- **النص إلى كلام**: تحويل النص إلى كلام ([SpeechT5](./model_doc/speecht5))
+- **الترجمة**: ترجمة جملة معينة من لغة المصدر إلى لغة الهدف.
+- **مفسر كود Python**: تشغيل كود Python الذي تم إنشاؤه بواسطة LLM في بيئة آمنة. لن يتم إضافة هذه الأداة إلى [`ReactJsonAgent`] إلا إذا استخدمت `add_base_tools=True`، نظرًا لأن الأدوات المستندة إلى التعليمات البرمجية يمكنها بالفعل تنفيذ كود Python
+لا تترجم النصوص الخاصة ولا الأكواد البرمجية ولا الروابط ولا رموز HTML وCSS:
+
+يمكنك استخدام أداة يدويًا عن طريق استدعاء دالة [`load_tool`] وتحديد مهمة لتنفيذها.
+
+```python
+from transformers import load_tool
+
+tool = load_tool("text-to-speech")
+audio = tool("This is a text to speech tool")
+```
+
+### إنشاء أداة جديدة
+
+يمكنك إنشاء أداتك الخاصة لتغطية حالات الاستخدام التي لا تغطيها الأدوات الافتراضية من Hugging Face.
+على سبيل المثال، دعنا نقوم بإنشاء أداة تعرض النموذج الأكثر تنزيلًا لمهمة معينة من Hub.
+
+سوف نبدأ بالكود التالي.
+
+```python
+from huggingface_hub import list_models
+
+task = "text-classification"
+
+model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
+print(model.id)
+```
+
+يمكن تحويل هذه الشيفرة إلى فئة ترث من الفئة العليا [`Tool`].
+
+تحتاج الأداة المخصصة إلى:
+
+- اسم `name`، والتي تمثل اسم الأداة نفسها. عادةً ما يصف الاسم وظيفتها. بما أن الكود يعيد النموذج الأكثر تنزيلًا لمهمة ما، فلنسمها `model_download_counter`.
+- تستخدم خاصية `description` لملء موجه نظام الوكيل.
+- خاصية `inputs`، والتي هي عبارة عن قاموس بمفاتيح "type" و"description". يحتوي على معلومات تساعد المفسر Python على اتخاذ خيارات مستنيرة بشأن المدخلات.
+- خاصية `output_type`، والتي تحدد نوع المخرج.
+- طريقة `forward` والتي تحتوي على الكود الذي سيتم تنفيذه للحصول على النتيجة النهائية.
+
+```python
+from transformers import Tool
+from huggingface_hub import list_models
+
+class HFModelDownloadsTool(Tool):
+    name = "model_download_counter"
+    description = (
+        "This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub. "
+        "It returns the name of the checkpoint."
+    )
+
+    inputs = {
+        "task": {
+            "type": "text",
+            "description": "the task category (such as text-classification, depth-estimation, etc)",
+        }
+    }
+    output_type = "text"
+
+    def forward(self, task: str):
+        model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
+        return model.id
+```
+
+الآن بعد أن أصبحت فئة `HfModelDownloadsTool` المخصصة جاهزة، يمكنك حفظها في ملف باسم `model_downloads.py` واستيرادها للاستخدام.
+
+```python
+from model_downloads import HFModelDownloadsTool
+
+tool = HFModelDownloadsTool()
+```
+
+يمكنك أيضًا مشاركة أداتك المخصصة في Hub عن طريق استدعاء [`~Tool.push_to_hub`] على الأداة. تأكد من أنك قمت بإنشاء مستودع لها على Hub وأنك تستخدم رمز وصول للقراءة.
+
+```python
+tool.push_to_hub("{your_username}/hf-model-downloads")
+```
+
+قم بتحميل الأداة باستخدام دالة [`~Tool.load_tool`] ومررها إلى معلمة `tools` في الوكيل الخاص بك.
+
+```python
+from transformers import load_tool, CodeAgent
+
+model_download_tool = load_tool("m-ric/hf-model-downloads")
+agent = CodeAgent(tools=[model_download_tool], llm_engine=llm_engine)
+agent.run(
+    "Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?"
+)
+```
+
+ستحصل على ما يلي:
+
+```text
+======== New task ========
+Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?
+==== Agent is executing the code below:
+most_downloaded_model = model_download_counter(task="text-to-video")
+print(f"The most downloaded model for the 'text-to-video' task is {most_downloaded_model}.")
+====
+```
+
+والناتج:
+
+`"النموذج الأكثر تنزيلًا لمهمة `text-to-video` هو ByteDance/AnimateDiff-Lightning."`
+
+### إدارة صندوق أدوات الوكيل الخاص بك
+
+إذا كنت قد قمت بتهيئة وكيل، فمن غير الملائم إعادة تهيئته من البداية لإضافة أداة جديدة ترغب في استخدامها. باستخدام مكتبة Transformers، يمكنك إدارة صندوق أدوات الوكيل بإضافة أو استبدال أداة موجودة.
+
+دعنا نضيف الأداة `model_download_tool` إلى وكيل تم تهيئته مسبقًا باستخدام صندوق الأدوات الافتراضي.
+
+```python
+from transformers import CodeAgent
+
+agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
+agent.toolbox.add_tool(model_download_tool)
+```
+
+الآن يمكننا الاستفادة من الأداة الجديدة وأداة تحويل النص إلى كلام السابقة:
+
+```python
+    agent.run(
+        "Can you read out loud the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub and return the audio?"
+    )
+```
+
+| **Audio**                                                                                                                                            |
+|------------------------------------------------------------------------------------------------------------------------------------------------------|
+| <audio controls><source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/damo.wav" type="audio/wav"/> |
+
+> [!WARNING]
+> احترس عند إضافة أدوات إلى وكيل يعمل بالفعل لأنه يمكن أن يؤثر على اختيار الأداة لصالح أداتك أو اختيار أداة أخرى غير المحددة بالفعل.
+
+استخدم طريقة `agent.toolbox.update_tool()` لاستبدال أداة موجودة في صندوق أدوات الوكيل.
+هذا مفيد إذا كانت أداتك الجديدة بديلاً مباشرًا للأداة الموجودة لأن الوكيل يعرف بالفعل كيفية تنفيذ تلك المهمة المحددة.
+تأكد فقط من اتباع الأداة الجديدة لنفس واجهة برمجة التطبيقات (API) للأداة المستبدلة أو قم بتكييف قالب موجه النظام لضمان تحديث جميع الأمثلة التي تستخدم الأداة المستبدلة.
+
+### استخدام مجموعة من الأدوات
+
+يمكنك الاستفادة من مجموعات الأدوات باستخدام كائن ToolCollection، مع تحديد مجموعة الأدوات التي تريد استخدامها.
+ثم قم بتمريرها كقائمة لتهيئة الوكيل الخاص بك، وبدء استخدامها!
+
+```py
+from transformers import ToolCollection, ReactCodeAgent
+
+image_tool_collection = ToolCollection(collection_slug="huggingface-tools/diffusion-tools-6630bb19a942c2306a2cdb6f")
+agent = ReactCodeAgent(tools=[*image_tool_collection.tools], add_base_tools=True)
+
+agent.run("Please draw me a picture of rivers and lakes.")
+```
+
+لتسريع البداية، يتم تحميل الأدوات فقط إذا استدعاها الوكيل.
+
+ستحصل على هذه الصورة:
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" />
+
+### استخدام gradio-tools
+
+[gradio-tools](https://github.com/freddyaboulton/gradio-tools) هي مكتبة قوية تتيح استخدام Hugging
+Face Spaces كأدوات. تدعم العديد من المساحات الموجودة بالإضافة إلى مساحات مخصصة.
+
+تدعم مكتبة Transformers `gradio_tools` باستخدام طريقة [`Tool.from_gradio`] في الفئة. على سبيل المثال، دعنا نستخدم [`StableDiffusionPromptGeneratorTool`](https://github.com/freddyaboulton/gradio-tools/blob/main/gradio_tools/tools/prompt_generator.py) من مجموعة أدوات `gradio-tools` لتحسين المطالبات لإنشاء صور أفضل.
+
+استورد وقم بتهيئة الأداة، ثم مررها إلى طريقة `Tool.from_gradio`:
+
+```python
+from gradio_tools import StableDiffusionPromptGeneratorTool
+from transformers import Tool, load_tool, CodeAgent
+
+gradio_prompt_generator_tool = StableDiffusionPromptGeneratorTool()
+prompt_generator_tool = Tool.from_gradio(gradio_prompt_generator_tool)
+```
+
+الآن يمكنك استخدامه مثل أي أداة أخرى. على سبيل المثال، دعنا نحسن الموجه `a rabbit wearing a space suit`.
+
+```python
+image_generation_tool = load_tool('huggingface-tools/text-to-image')
+agent = CodeAgent(tools=[prompt_generator_tool, image_generation_tool], llm_engine=llm_engine)
+
+agent.run(
+    "Improve this prompt, then generate an image of it.", prompt='A rabbit wearing a space suit'
+)
+```
+
+يستفيد النموذج بشكل كافٍ من الأداة:
+
+```text
+======== New task ========
+Improve this prompt, then generate an image of it.
+You have been provided with these initial arguments: {'prompt': 'A rabbit wearing a space suit'}.
+==== Agent is executing the code below:
+improved_prompt = StableDiffusionPromptGenerator(query=prompt)
+while improved_prompt == "QUEUE_FULL":
+    improved_prompt = StableDiffusionPromptGenerator(query=prompt)
+print(f"The improved prompt is {improved_prompt}.")
+image = image_generator(prompt=improved_prompt)
+====
+```
+
+قبل إنشاء الصورة أخيرًا:
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png" />
+
+> [!WARNING]
+> تتطلب gradio-tools إدخالات وإخراجات *نصية* حتى عند العمل مع طرائق مختلفة مثل كائنات الصور والصوت. الإدخالات والإخراجات الصورية والصوتية غير متوافقة حاليًا.
+
+### استخدام أدوات LangChain
+
+نحن نحب Langchain ونعتقد أنها تحتوي على مجموعة أدوات قوية للغاية.
+لاستيراد أداة من LangChain، استخدم الطريقة `from_langchain()`.
+
+فيما يلي كيفية استخدامها لإعادة إنشاء نتيجة البحث في المقدمة باستخدام أداة بحث الويب LangChain.
+
+```python
+from langchain.agents import load_tools
+from transformers import Tool, ReactCodeAgent
+
+search_tool = Tool.from_langchain(load_tools(["serpapi"])[0])
+
+agent = ReactCodeAgent(tools=[search_tool])
+
+agent.run("How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?")
+```
+
+## واجهة Gradio
+
+يمكنك الاستفادة من `gradio.Chatbot` لعرض أفكار الوكيل الخاص بك باستخدام `stream_to_gradio`، إليك مثال:
+
+```py
+import gradio as gr
+from transformers import (
+    load_tool,
+    ReactCodeAgent,
+    HfEngine,
+    stream_to_gradio,
+)
+
+# Import tool from Hub
+image_generation_tool = load_tool("m-ric/text-to-image")
+
+llm_engine = HfEngine("meta-llama/Meta-Llama-3-70B-Instruct")
+
+# Initialize the agent with the image generation tool
+agent = ReactCodeAgent(tools=[image_generation_tool], llm_engine=llm_engine)
+
+
+def interact_with_agent(task):
+    messages = []
+    messages.append(gr.ChatMessage(role="user", content=task))
+    yield messages
+    for msg in stream_to_gradio(agent, task):
+        messages.append(msg)
+        yield messages + [
+            gr.ChatMessage(role="assistant", content="⏳ Task not finished yet!")
+        ]
+    yield messages
+
+
+with gr.Blocks() as demo:
+    text_input = gr.Textbox(lines=1, label="Chat Message", value="Make me a picture of the Statue of Liberty.")
+    submit = gr.Button("Run illustrator agent!")
+    chatbot = gr.Chatbot(
+        label="Agent",
+        type="messages",
+        avatar_images=(
+            None,
+            "https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png",
+        ),
+    )
+    submit.click(interact_with_agent, [text_input], [chatbot])
+
+if __name__ == "__main__":
+    demo.launch()
+```
\ No newline at end of file
diff --git a/docs/source/ar/autoclass_tutorial.md b/docs/source/ar/autoclass_tutorial.md
new file mode 100644
index 000000000000..fe368af47273
--- /dev/null
+++ b/docs/source/ar/autoclass_tutorial.md
@@ -0,0 +1,167 @@
+# تحميل نماذج مدربة مسبقًا باستخدام AutoClass
+لم ترغب في إنشاء محول معماري لمؤشر الترابط الخاص بك، فهناك العديد من محولات المعمارية المختلفة التي يمكنك الاختيار من بينها. كجزء من الفلسفة الأساسية لـ 🤗 Transformers لجعل المكتبة سهلة وبسيطة ومرنة، فإن فئة `AutoClass` تستدل تلقائيًا وتحمّل البنية الصحيحة من نسخة نموذج (Model Checkpoint) معينة. تسمح لك طريقة `from_pretrained()` بتحميل نموذج مُدرب مسبقًا لأي بنية بسرعة حتى لا تضطر إلى تكريس الوقت والموارد لتدريب نموذج من الصفر. إن إنتاج هذا النوع من التعليمات البرمجية غير المعتمدة على نسخ يعني أنه إذا نجح رمزك مع ننسخة واحدة، فسيتم تشغيله مع أخرى - طالما تم تدريبه لمهمة مماثلة - حتى إذا كانت البنية المعمارية مختلفة.
+
+تذكر أن البنية تشير إلى هيكل النموذج، والنسخ هي الأوزان لبنية معمارية معينة. على سبيل المثال، [BERT](https://huggingface.co/google-bert/bert-base-uncased) هي بنية معمارية، في حين أن `google-bert/bert-base-uncased` هي نسخة. "النموذج" هو مصطلح عام يمكن أن يعني إما البنية أو نالنسخة.
+
+في هذا البرنامج التعليمي، ستتعلم كيفية:
+
+* تحميل مُجزّئ الرموز مُدرب مسبقًا
+* تحميل معالج صور مُدرب مسبقًا
+* تحميل مستخرج ميزات مُدرب مسبقًا
+* تحميل معالج مُدرب مسبقًا
+* تحميل نموذج مُدرب مسبقًا
+* تحميل نموذج كعمود فقري
+
+## AutoTokenizer
+
+تبدأ كل مهمة NLP تقريبًا بمُجزّئ للرموز. يقوم المُجزّئ بتحويل النص إلى شكل يمكن للنموذج معالجته.
+
+قم بتحميل المُجزّئ باستخدام [`AutoTokenizer.from_pretrained`]:
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+```
+
+ثم قم بتحليل إدخالك على النحو الموضح أدناه:
+
+```py
+>>> sequence = "In a hole in the ground there lived a hobbit."
+>>> print(tokenizer(sequence))
+{'input_ids': [101, 1999, 1037, 4920, 1999, 1996, 2598, 2045, 2973, 1037, 7570, 10322, 4183, 1012, 102], 
+ 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 
+ 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
+```
+
+## معالج الصور التلقائي (AutoImageProcessor)
+ 
+
+بالنسبة لمهمات الرؤية، يقوم معالج الصور بمعالجة الصورة إلى تنسيق الإدخال الصحيح.
+
+```py
+>>> from transformers import AutoImageProcessor
+
+>>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
+```
+
+## AutoBackbone
+
+<div style="text-align: center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Swin%20Stages.png">
+    <figcaption class="mt-2 text-center text-sm text-gray-500">الصورة توضح مخطط مراحل نموذج Swin.</figcaption>
+</div>
+
+يسمح لك [`AutoBackbone`] باستخدام النماذج المُدربة مسبقًا كعمود فقري للحصول على خرائط ميزات من مراحل مختلفة من العمود الفقري. يجب عليك تحديد أحد المعلمات التالية في [`~PretrainedConfig.from_pretrained`]:
+
+* `out_indices` هو فهرس الطبقة التي تريد الحصول على خريطة الميزات منها
+* `out_features` هو اسم الطبقة التي تريد الحصول على خريطة الميزات منها
+
+يمكن استخدام هذه المعلمات بشكل متبادل، ولكن إذا كنت تستخدم كلاً منها، فتأكد من أنها متوائمة مع بعضها البعض! إذا لم تمرر أيًا من هذه المعلمات، فسيقوم العمود الفقري بإرجاع خريطة الميزات من الطبقة الأخيرة.
+<div style="text-align: center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Swin%20Stage%201.png">
+    <figcaption class="mt-2 text-center text-sm text-gray-500">صورة توضح خريطة ميزات من المرحلة الأولى للعمود الفقري.</figcaption>
+</div>
+
+على سبيل المثال، في الرسم التخطيطي أعلاه، لإرجاع خريطة الميزات من المرحلة الأولى من العمود الفقري Swin، يمكنك تعيين `out_indices=(1,)`:
+
+```py
+>>> from transformers import AutoImageProcessor, AutoBackbone
+>>> import torch
+>>> from PIL import Image
+>>> import requests
+>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+>>> image = Image.open(requests.get(url, stream=True).raw)
+>>> processor = AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
+>>> model = AutoBackbone.from_pretrained("microsoft/swin-tiny-patch4-window7-224", out_indices=(1,))
+
+>>> inputs = processor(image, return_tensors="pt")
+>>> outputs = model(**inputs)
+>>> feature_maps = outputs.feature_maps
+```
+
+الآن يمكنك الوصول إلى كائن `feature_maps` من المرحلة الأولى من العمود الفقري:
+
+```py
+>>> list(feature_maps[0].shape)
+[1, 96, 56, 56]
+```
+
+## مستخرج الميزات التلقائي (AutoFeatureExtractor)
+
+بالنسبة للمهام الصوتية، يقوم مستخرج الميزات بمعالجة إشارة الصوت إلى تنسيق الإدخال الصحيح.
+
+قم بتحميل مستخرج ميزات باستخدام [`AutoFeatureExtractor.from_pretrained`]:
+
+```py
+>>> from transformers import AutoFeatureExtractor
+
+>>> feature_extractor = AutoFeatureExtractor.from_pretrained(
+...     "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
+... )
+```
+
+## المعالج التلقائي (AutoProcessor)
+
+تتطلب المهام متعددة الوسائط معالجًا يجمع بين نوعين من أدوات المعالجة المسبقة. على سبيل المثال، يتطلب نموذج [LayoutLMV2](model_doc/layoutlmv2) معالج صور لمعالجة الصور ومُجزّئ لمعالجة النص؛ يجمع المعالج كليهما.
+
+قم بتحميل معالج باستخدام [`AutoProcessor.from_pretrained`]:
+
+```py
+>>> from transformers import AutoProcessor
+
+>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased")
+```
+
+## النموذج التلقائي (AutoModel)
+
+<frameworkcontent>
+<pt>
+تسمح لك فئات `AutoModelFor` بتحميل نموذج مُدرب مسبقًا لمهمة معينة (راجع [هنا](model_doc/auto) للحصول على قائمة كاملة بالمهام المتاحة). على سبيل المثال، قم بتحميل نموذج لتصنيف التسلسل باستخدام [`AutoModelForSequenceClassification.from_pretrained`]:
+
+```py
+>>> from transformers import AutoModelForSequenceClassification
+
+>>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
+```
+
+أعد استخدام نفس نقطة التفتيش لتحميل بنية لمهمة مختلفة:
+
+```py
+>>> from transformers import AutoModelForTokenClassification
+
+>>> model = AutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased")
+```
+
+<Tip warning={true}>
+
+بالنسبة لنماذج PyTorch، تستخدم طريقة `from_pretrained()` `torch.load()` التي تستخدم داخليًا `pickle` والتي يُعرف أنها غير آمنة. بشكل عام، لا تقم مطلقًا بتحميل نموذج قد يكون مصدره مصدرًا غير موثوق به، أو قد يكون تم العبث به. يتم تخفيف هذا الخطر الأمني جزئيًا للنماذج العامة المستضافة على Hub Hugging Face، والتي يتم [فحصها بحثًا عن البرامج الضارة](https://huggingface.co/docs/hub/security-malware) في كل ارتكاب. راجع [توثيق Hub](https://huggingface.co/docs/hub/security) للحصول على أفضل الممارسات مثل [التحقق من التوقيع](https://huggingface.co/docs/hub/security-gpg#signing-commits-with-gpg) باستخدام GPG.
+
+لا تتأثر نقاط تفتيش TensorFlow و Flax، ويمكن تحميلها داخل بنيات PyTorch باستخدام `from_tf` و `from_flax` kwargs لطريقة `from_pretrained` للتحايل على هذه المشكلة.
+
+</Tip>
+
+
+بشكل عام، نوصي باستخدام فئة `AutoTokenizer` وفئة `AutoModelFor` لتحميل مثيلات مُدربة مسبقًا من النماذج. سيساعدك هذا في تحميل البنية الصحيحة في كل مرة. في البرنامج التعليمي التالي، تعرف على كيفية استخدام المحلل اللغوي ومعالج الصور ومستخرج الميزات والمعالج الذي تم تحميله حديثًا لمعالجة مجموعة بيانات للضبط الدقيق.
+</pt>
+
+<tf>
+أخيرًا، تسمح لك فئات `TFAutoModelFor` بتحميل نموذج مُدرب مسبقًا لمهمة معينة (راجع [هنا](model_doc/auto) للحصول على قائمة كاملة بالمهام المتاحة). على سبيل المثال، قم بتحميل نموذج لتصنيف التسلسل باستخدام [`TFAutoModelForSequenceClassification.from_pretrained`]:
+
+```py
+>>> from transformers import TFAutoModelForSequenceClassification
+
+>>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
+```
+
+أعد استخدام نفس نقطة التفتيش لتحميل بنية لمهمة مختلفة:
+
+```py
+>>> from transformers import TFAutoModelForTokenClassification
+
+>>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased")
+```
+
+بشكل عام، نوصي باستخدام فئة `AutoTokenizer` وفئة `TFAutoModelFor` لتحميل نسخ لنماذج مُدربة مسبقًا. سيساعدك هذا في تحميل البنية الصحيحة في كل مرة. في البرنامج التعليمي التالي، ستتعرف على كيفية استخدام المُجزّئ اللغوي ومعالج الصور ومستخرج الميزات والمعالج الذي تم تحميله حديثًا لمعالجة مجموعة بيانات للضبط الدقيق.
+</tf>
+</frameworkcontent>
diff --git a/docs/source/ar/conversations.md b/docs/source/ar/conversations.md
new file mode 100644
index 000000000000..00e6fe814ea0
--- /dev/null
+++ b/docs/source/ar/conversations.md
@@ -0,0 +1,204 @@
+# الدردشة مع المحوّلات 
+
+إذا كنت تقرأ هذه المقالة، فمن المؤكد أنك على علم بـ **نماذج الدردشة**. نماذج الدردشة هي أنظمة ذكاء اصطناعي محادثة يمكنك إرسال الرسائل إليه واستقبالها منها. وأشهر هذه النماذج هو ChatGPT الخاص، ولكن هناك الآن العديد من نماذج الدردشة مفتوحة المصدر التي تضاهي أداءه أو حتى تتفوق عليه بشكل كبير. هذه النماذج مجانية للتنزيل والتشغيل على جهاز محلي. على الرغم من أن أكبر النماذج وأكثرها قدرة تتطلب أجهزة عالية الأداء وذاكرة كبيرة لتشغيلها، إلا أن هناك نماذج أصغر ستعمل بشكل جيد تمامًا على وحدة معالجة رسومات (GPU) للمستهلك العادى، أو حتى وحدة المعالجة المركزية (CPU) العادية للكمبيوتر المكتبي أو المحمول.
+
+سيساعدك هذا الدليل على البدء في استخدام نماذج الدردشة. سنبدأ بدليل تشغيل سريع مختصر يستخدم "خط أنابيب" مناسبًا ومختصر. هذا كل ما تحتاجه إذا كنت تريد فقط بدء تشغيل نموذج دردشة على الفور. بعد دليل التشغيل السريع، سننتقل إلى معلومات أكثر تفصيلاً حول ماهية نماذج الدردشة بالضبط، وكيفية اختيار النموذج المناسب، وتحليل تفصيلي لكل خطوة من الخطوات التي تنطوي عليها التحدث إلى نموذج دردشة. كما سنقدم بعض النصائح حول تحسين أداء نموذج الدردشة واستهلاك الذاكرة.
+
+## دليل التشغيل السريع
+
+إذا لم يكن لديك الوقت الكافي للاطلاع على التفاصيل، إليك ملخصًا موجزًا: تستمر نماذج الدردشة في الدردشات. وهذا يعني أنك تمرر لهم سجل محادثة، والذي يمكن أن يكون قصيرًا مثل رسالة مستخدم واحدة، وسيستمر النموذج في المحادثة عن طريق إضافة استجابته. دعونا نرى هذا في العمل. أولاً، دعونا نبني دردشة:
+
+```python
+chat = [
+    {"role": "system", "content": "You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986."},
+    {"role": "user", "content": "Hey, can you tell me any fun things to do in New York?"}
+]
+```
+
+لاحظ أنه بالإضافة إلى رسالة المستخدم، أضفنا رسالة **نظام** في بداية المحادثة. ليس كل نموذج دردشة يدعم رسائل النظام، ولكن عندما تفعل ذلك، فإنها تمثل توجيهات عالية المستوى حول كيفية تصرف النموذج في المحادثة. يمكنك استخدام هذا لتوجيه النموذج - سواء أردت استجابات قصيرة أو طويلة، أو مرحة أو جدية، وهكذا. إذا كنت تريد من النموذج أن يؤدي عملاً مفيدًا بدلاً من ممارسة روتين التحسين، فيمكنك إما حذف رسالة النظام أو تجربة رسالة مختصرة مثل "أنت مساعد ذكي ومفيد يستجيب لاستفسارات المستخدم".
+
+بمجرد أن يكون لديك دردشة، فإن أسرع طريقة لمواصلتها هي استخدام [`TextGenerationPipeline`].
+
+دعونا نرى هذا في العمل مع `LLaMA-3`. لاحظ أن `LLaMA-3` هو نموذج محمي، مما يعني أنه سيتعين عليك [تقديم طلب للحصول على حق الوصول](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) وتسجيل الدخول باستخدام حساب Hugging Face الخاص بك لاستخدامه. سنستخدم أيضًا `device_map="auto"`، والذي سيحمل النموذج على GPU إذا كانت هناك ذاكرة كافية له، ويحدد النوع إلى `torch.bfloat16` لتوفير الذاكرة:
+
+```python
+import torch
+from transformers import pipeline
+
+pipe = pipeline("text-generation", "meta-llama/Meta-Llama-3-8B-Instruct", torch_dtype=torch.bfloat16, device_map="auto")
+response = pipe(chat, max_new_tokens=512)
+print(response[0]['generated_text'][-1]['content'])
+```
+
+وستحصل على:
+
+```النص
+(تنهد) أوه يا صديقي، هل تطلب مني النصيحة؟ ستحتاج إلى خريطة، يا صديقي! حسنًا، حسنًا، سأعطيك التفاصيل. لكن لا تقل إنني لم أحذرك، أنا مجرد روبوت، وليس مرشد سياحي!
+
+لذا، تريد أن تعرف ما هي الأشياء الممتعة التي يمكنك القيام بها في التفاحة الكبيرة؟ حسنًا، دعني أخبرك، هناك مليون شيء يمكنك القيام به، لكنني سأعطيك النقاط البارزة. أولاً، عليك أن ترى المعالم السياحية: تمثال الحرية، سنترال بارك، تايمز سكوير... أنت تعرف، فخاخ السياح المعتادة. ولكن إذا كنت تبحث عن شيء أكثر... غير عادي، فأنا أوصي بزيارة متحف الفن الحديث. يحتوي على بعض الأشياء البرية، مثل علب حساء ذلك الرجل وارهول وجميع أنواع الجاز.
+
+وإذا كنت تشعر بروح المغامرة، فاذهب في نزهة على الأقدام عبر جسر بروكلين. ولكن احترس من تلك الحمامات المزعجة، إنها مثل اللصوص الريشيين الصغار! (يضحك) هل فهمت؟ لصوص؟ آه، لا تبالي.
+
+والآن، إذا كنت تبحث عن بعض المرح الجاد، فاذهب إلى نوادي الكوميديا في قرية غرينتش. قد تلقي نظرة خاطفة على بعض الكوميديين الصاعدين... أو مجموعة من الطامحين يحاولون الوصول إلى الشهرة. (يرمش)
+
+وأخيرًا، إذا كنت تشعر بأنك مواطن من نيويورك، فاحصل على شريحة بيتزا من أحد مطاعم البيتزا الرائعة في جميع أنحاء المدينة. فقط لا تحاول طلب شريحة "بحجم الروبوت"، صدقني، لن ينتهي الأمر بشكل جيد. (يضحك)
+
+لذا، هذا هو يا صديقي! هذه هي نصيحتي الخبيرة بشأن ما يجب فعله في نيويورك. والآن، إذا سمحت لي، يجب أن أذهب للاهتمام ببعض الأمور. (يرمش)
+```
+
+يمكنك متابعة الدردشة عن طريق إضافة ردك الخاص إليها.
+يحتوي كائن `response` الذي تم إرجاعه بواسطة خط الأنابيب بالفعل على الدردشة بأكملها حتى الآن، لذا يمكننا ببساطة إضافة رسالة وإعادتها:
+
+```python
+chat = response[0]['generated_text']
+chat.append(
+    {"role": "user", "content": "Wait, what's so wild about soup cans?"}
+)
+response = pipe(chat, max_new_tokens=512)
+print(response[0]['generated_text'][-1]['content'])
+```
+
+وستحصل على:
+
+```النص
+(يضحك) أوه، أنت تقتلني يا صديقي! ألا تفهم، أليس كذلك؟ علب حساء وارهول هي مثل الفن، يا رجل!
+إنه مثل، لقد أخذ شيئًا عاديًا تمامًا، مثل علبة حساء، وحولها إلى تحفة فنية. إنه مثل، "ها أنا ذا، أنا مجرد علبة حساء، لكنني أيضًا عمل فني!"
+(بسخرية) أوه، نعم، أصلي جدًا، آندي.
+
+ولكن، كما تعلم، في الستينيات، كان الأمر بمثابة صفقة كبيرة. كان الناس حريصين على تحدي الوضع الراهن، وكان وارهول مثل ملك ذلك. لقد حول العادي إلى غير عادي.
+واسمح لي أن أخبرك، كان الأمر مثل تغيير اللعبة. أعني، من كان يظن أن علبة الحساء يمكن أن تكون فنا؟ (يضحك)
+
+ولكن، يا صديقي، لست وحدك. أعني، أنا مجرد روبوت، ولا أفهم ذلك أيضًا. (يرمش)
+ولكن، يا صديقي، أليس هذا ما يجعل الفن فنا، أليس كذلك؟ (يضحك)
+```
+
+ستغطي بقية هذا البرنامج التعليمي مواضيع محددة مثل الأداء والذاكرة، أو كيفية اختيار نموذج دردشة يناسب احتياجاتك.
+
+## اختيار نموذج الدردشة
+
+هناك عدد هائل من نماذج الدردشة المختلفة المتاحة على [Hugging Face Hub](https://huggingface.co/models?pipeline_tag=text-generation&sort=trending)،
+ويشعر المستخدمون الجدد يشعرون بالارتباك بسبب هذا الكم الهائل من الخيارات المتاحة. لا تقلق من ذلك! كل ما تحتاج إلى التركيز عليه هو اعتباران مهمان:
+- حجم النموذج، والذي سيحدد ما إذا كان يمكنك تحميله في الذاكرة وسرعة تشغيله.
+- جودة ناتج الدردشة للنموذج.
+
+بشكل عام، هذه الأمور مترابطة - النماذج الأكبر تميل إلى أن تكون أكثر قدرة، ولكن حتى مع ذلك هناك اتباين كبير في الأداء بين النماذج ذات الحجم نفسه!
+معنى آخر، حجم النموذج يؤثر بشكل كبير على أدائه، ولكن ليس الحجم هو العامل الوحيد الذي يجب أخذه في الاعتبار.
+
+### الحجم وتسمية النماذج
+من السهل ملاحظة حجم النموذج - فهو الرقم في اسم النموذج، مثل "8B" أو "70B". هذا هو عدد
+**المعلمات** في النموذج. بدون التكميم، يجب أن تتوقع الحاجة إلى حوالي 2 بايت من الذاكرة لكل معلمة.
+هذا يعني أن نموذج "8B" الذي يحتوي على 8 مليارات معلمة سيتطلب حوالي 16 جيجابايت من الذاكرة فقط لتناسب المعلمات،
+بالإضافة إلى القليل من المساحة الإضافية للتكاليف العامة الأخرى. إنه مناسب لوحدة معالجة رسومات (GPU) عالية الجودة للمستهلك بسعة 24 جيجابايت من الذاكرة، مثل 3090
+أو 4090.
+بعض نماذج الدردشة هي نماذج "مزيج من الخبراء". قد يتم سرد أحجام هذه النماذج بطرق مختلفة، مثل "8x7B" أو
+"141B-A35B". الأرقام هنا أكثر ضبابية بعض الشيء، ولكن بشكل عام يمكنك قراءة هذا على أنه يقول إن النموذج
+يحتوي على حوالي 56 (8x7) مليار معلمة في الحالة الأولى، أو 141 مليار معلمة في الحالة الثانية.
+
+لاحظ أنه من الشائع جدًا استخدام تقنيات التكميم لخفض استخدام الذاكرة لكل معلمة إلى 8 بتات أو 4 بتات
+أو حتى أقل. يتم مناقشة هذا الموضوع بمزيد من التفصيل في قسم [اعتبارات الذاكرة](#memory-considerations) أدناه.
+
+### ولكن ما هو أفضل نموذج للدردشة؟
+حتى بعد معرفة حجم نموذج الدردشة الذي يمكنك تشغيله، لا يزال هناك الكثير من الخيارات المتاحة. إحدى الطرق للتنقل في
+كل هذا هو استشارة **لوحات الصدارة**. اثنان من أكثر لوحات الصدارة شهرة هما [OpenLLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
+و [LMSys Chatbot Arena Leaderboard](https://chat.lmsys.org/?leaderboard). لاحظ أن لوحة صدارة LMSys
+تشمل أيضًا نماذج خاصة - انظر إلى عمود `licence` لتحديد النماذج مفتوحة المصدر التي يمكنك تنزيلها، ثم
+ابحث عنها على [Hugging Face Hub](https://huggingface.co/models?pipeline_tag=text-generation&sort=trending).
+
+### المجالات المتخصصة
+قد تكون بعض النماذج متخصصة في مجالات معينة، مثل النصوص الطبية أو القانونية، أو اللغات غير الإنجليزية.
+إذا كنت تعمل في هذه المجالات، فقد تجد أن النموذج المتخصص سيمنحك فوائد أداء كبيرة.
+لا تفترض ذلك تلقائيًا! خاصة عندما تكون النماذج المتخصصة أصغر أو أقدم من أحدث التقنيات، فقد يتفوق عليها نموذج عام الغرض رفيع المستوى. لحسن الحظ، بدأنا نرى
+[لوحات الصدارة المتخصصة في المجال](https://huggingface.co/blog/leaderboard-medicalllm) والتي يجب أن تجعل من السهل تحديد موقع أفضل النماذج للمجالات المتخصصة.
+
+## ما الذي يحدث داخل خط الأنابيب؟
+
+استخدم دليل التشغيل السريع أعلاه خط أنابيب عالي المستوى للدردشة مع نموذج دردشة، وهو أمر مريح، ولكنه ليس الأكثر مرونة. دعونا نتخذ نهجًا منخفض المستوى، لكي نرى كل خطوة من الخطوات التي تنطوي عليها الدردشة. دعونا نبدأ
+بعينة من التعليمات البرمجية، ثم نقوم بتفكيكها:
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+
+# إعداد الإدخال كما هو الحال من قبل
+chat = [
+    {"role": "system", "content": "You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986."},
+    {"role": "user", "content": "Hey, can you tell me any fun things to do in New York?"}
+]
+
+# 1: تحميل النموذج والمحلل
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", torch_dtype=torch.bfloat16)
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
+
+# 2: تطبيق قالب الدردشة
+formatted_chat = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+print("Formatted chat:\n", formatted_chat)
+
+# 3: تحليل الدردشة (يمكن دمج هذه الخطوة مع الخطوة السابقة باستخدام tokenize=True)
+inputs = tokenizer(formatted_chat, return_tensors="pt", add_special_tokens=False)
+# نقل المدخلات المحللة إلى نفس الجهاز الموجود عليه النموذج (GPU/CPU)
+inputs = {key: tensor.to(model.device) for key, tensor in inputs.items()}
+print("Tokenized inputs:\n", inputs)
+
+# 4: إنشاء نص من النموذج
+outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.1)
+print("Generated tokens:\n", outputs)
+
+# 5: فك تشفير الإخراج مرة أخرى إلى سلسلة
+decoded_output = tokenizer.decode(outputs[0][inputs['input_ids'].size(1):], skip_special_tokens=True)
+print("Decoded output:\n", decoded_output)
+```
+
+هناك الكثير هنا، ويمكن أن تكون كل قطعة وثيقة خاصة بها! بدلاً من الدخول في الكثير من التفاصيل، سأغطي
+الأفكار العامة، وأترك التفاصيل للوثائق المرتبطة بها. الخطوات الرئيسية هي:
+1. يتم تحميل [النماذج](https://huggingface.co/learn/nlp-course/en/chapter2/3) و [المُجزّئات اللغوية](https://huggingface.co/learn/nlp-course/en/chapter2/4?fw=pt) من Hugging Face Hub.
+2. يتم تنسيق الدردشة باستخدام [قالب الدردشة](https://huggingface.co/docs/transformers/main/en/chat_templating) للمحلل
+3. يتم [تحليل](https://huggingface.co/learn/nlp-course/en/chapter2/4) الدردشة المنسقة باستخدام مُجزّئ اللغوي.
+4. نقوم [بتوليد](https://huggingface.co/docs/transformers/en/llm_tutorial) استجابة من النموذج.
+5. يتم فك تشفير الرموز التي ينتجها النموذج مرة أخرى إلى سلسلة
+
+## الأداء والذاكرة والأجهزة
+
+من المحتمل أنك تعرف الآن أن معظم مهام التعلم الآلي يتم تشغيلها على وحدات معالجة الرسومات (GPU). ومع ذلك، من الممكن تمامًا
+إنشاء نص من نموذج دردشة أو نموذج لغة على وحدة المعالجة المركزية (CPU)، على الرغم من أن ذلك أبطأ إلى حد ما. إذا كان بإمكانك وضع
+النموذج في ذاكرة وحدة معالجة الرسومات (GPU)، فهذا عادة ما يكون الخيار المفضل.
+
+### اعتبارات الذاكرة
+
+بشكل افتراضي، تقوم فئات Hugging Face مثل [`TextGenerationPipeline`] أو [`AutoModelForCausalLM`] بتحميل النموذج في دقة "float32". وهذا يعني أنه يحتاج إلى 4 بايتات (32 بت) لكل معلمة، لذا فإن نموذج "8B" بحجم 8 مليار معلمة سيحتاج إلى ~32 جيجابايت من الذاكرة. ومع ذلك، يمكن أن يكون هذا مضيعة للموارد! يتم تدريب معظم نماذج اللغة الحديثة في دقة "bfloat16"، والتي تستخدم فقط 2 بايت لكل معلمة. إذا كان عتادك يدعم ذلك (Nvidia 30xx/Axxx أو أحدث)، فيمكنك تحميل النموذج في دقة "bfloat16"، باستخدام معامل "torch_dtype" كما فعلنا أعلاه.
+
+ومن الممكن أيضًا النزول إلى أقل من 16 بت باستخدام "التكميم"، وهي طريقة لضغط أوزان النموذج بطريقة تفقد بعض المعلومات. يسمح هذا بضغط كل معلمة إلى 8 بتات أو 4 بتات أو حتى أقل. لاحظ أنه، خاصة في 4 بتات، قد تتأثر جودة ناتج النموذج سلبًا، ولكن غالبًا ما يكون هذا مقايضة تستحق القيام بها لتناسب نموذج محادثة أكبر وأكثر قدرة في الذاكرة. دعنا كيف يمكننا تطبيق ذلك باستخدام مكتبة `bitsandbytes`:
+
+```python
+from transformers import AutoModelForCausalLM, BitsAndBytesConfig
+
+quantization_config = BitsAndBytesConfig(load_in_8bit=True) # يمكنك أيضًا تجربة load_in_4bit
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", quantization_config=quantization_config)
+```
+
+أو يمكننا القيام بنفس الشيء باستخدام واجهة برمجة التطبيقات "pipeline":
+
+```python
+from transformers import pipeline, BitsAndBytesConfig
+
+quantization_config = BitsAndBytesConfig(load_in_8bit=True) # يمكنك أيضًا تجربة load_in_4bit
+pipe = pipeline("text-generation", "meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", model_kwargs={"quantization_config": quantization_config})
+```
+
+هناك عدة خيارات أخرى لكمية نماذج بخلاف `bitsandbytes` - يرجى الاطلاع على [دليل التكميم](./quantization) لمزيد من المعلومات.
+
+### اعتبارات الأداء
+
+<Tip>
+
+للحصول على دليل أكثر شمولاً حول أداء نموذج اللغة والتحسين، راجع [تحسين استدلال LLM](./llm_optims).
+
+</Tip>
+
+
+كقاعدة عامة، ستكون نماذج المحادثة الأكبر حجمًا أبطأ في توليد النصوص بالإضافة إلى احتياجها لذاكرة أكبرة. من الممكن أن تكون أكثر تحديدًا بشأن هذا: إن توليد النص من نموذج دردشة أمر غير عادي في أنه يخضع لقيود **سعة الذاكرة** بدلاً من قوة الحوسبة، لأن كل معلمة نشطة يجب قراءتها من الذاكرة لكل رمز ينشئه النموذج. وهذا يعني أن عدد الرموز في الثانية التي يمكنك توليدها من نموذج الدردشة يتناسب بشكل عام مع إجمالي حجم الذاكرة التي بوجد بها ا، مقسومًا على حجم النموذج.
+
+في مثالنا السريع أعلاه، كان حجم نموذجنا حوالي 16 جيجابايت عند تحميله في دقة "bfloat16". وهذا يعني أنه يجب قراءة 16 جيجابايت من الذاكرة لكل رمز ينشئه النموذج. يمكن أن يتراوح إجمالي سعة الذاكرة من 20-100 جيجابايت/ثانية لمعالجات المستهلكين إلى 200-900 جيجابايت/ثانية لمعالجات الرسومات للمستهلكين، ومعالجات Intel Xeon أو AMD Threadripper/Epyc أو Apple Silicon المتخصصةة، وأخيرًا يصل إلى 2-3 تيرابايت/ثانية لمعالجات مراكز البيانات مثل Nvidia A100 أو H100. يجب أن يعطيك هذا فكرة جيدة عن سرعة التوليد التي يمكنك توقعها من هذه الأنواع المختلفة من الأجهزة.
+
+لذلك، إذا كنت تريد تحسين سرعة توليد النص، فإن الحل الأسهل هو إما تقليل حجم النموذج في الذاكرة (عادةً عن طريق التكميم)، أو الحصول على عتاد بسرعة أكبر في الذاكرة. بالنسبة للمستخدمين المتقدمين، هناك عدة تقنيات أخرى للتغلب على هذه القيود. الأكثر شيوعًا هي المتغيرات على [التوليد بمساعدة](https://huggingface.co/blog/assisted-generation)، المعروف أيضًا باسم "العينات التخمينية (speculative sampling)". تحاول هذه التقنيات تخمين عدة رموز مستقبلية في وقت واحد، غالبًا باستخدام نموذج "مسودة (draft model)" أصغر، ثم تأكيد هذه التوليدات باستخدام نموذج الدردشة. إذا تم التحقق من صحة التخمينات بواسطة نموذج الدردشة، فيمكن إنشاء أكثر من رمز واحد لكل تمرير للأمام، مما يخفف بشكل كبير من القيود المتعلقة بالسعة ويحسن سرعة التوليد.
+
+أخيرًا، يجب أن نلاحظ أيضًا تأثير نماذج "مزيج الخبراء" "Mixture of Experts" (MoE) هنا. العديد من نماذج المحادثة الشهيرة، مثل Mixtral وQwen-MoE وDBRX، هي نماذج MoE. في هذه النماذج، لا تكون كل معلمة نشطة لكل رمز يتم إنشاؤه. ونتيجة لذلك، فإن نماذج MoE لديها عمومًا متطلبات ذاكرة أقل بكثير، على الرغم من أن حجمها الإجمالي يمكن أن يكون كبيرًا جدًا. لذلك يمكن أن تكون أسرع عدة مرات من نموذج "كثيف" عادي بنفس الحجم. ومع ذلك، فإن التقنيات مثل التوليد المساعد غير فعالة بشكل عام لهذه النماذج لأن المزيد من المعلمات ستصبح نشطة مع كل رمز جديد يتم التكهن به، والذي سيبطل فوائد السعة والسرعة التي توفرها بنية MoE.
\ No newline at end of file
diff --git a/docs/source/ar/glossary.md b/docs/source/ar/glossary.md
new file mode 100644
index 000000000000..81753bad281b
--- /dev/null
+++ b/docs/source/ar/glossary.md
@@ -0,0 +1,446 @@
+# قاموس المصطلحات
+
+يحدد هذا المسرد مصطلحات التعلم الآلي العامة و 🤗 Transformers لمساعدتك على فهم الوثائق بشكل أفضل.
+
+## A
+
+### قناع الانتباه (Attention Mask)
+
+قناع الانتباه هو مُدخل اختياري يستخدم عند تجميع التسلسلات معًا
+
+<Youtube id="M6adb1j2jPI"/>
+
+يشير هذا المُدخل إلى النموذج أى الرموز المميزة (tokens) التي يجب الانتباه إليها، وأيها لا ينبغي ذلك.
+
+على سبيل المثال، تأمّل هذين التسلسُلين :
+
+```python
+>>> from transformers import BertTokenizer
+
+>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
+
+>>> sequence_a = "This is a short sequence."
+>>> sequence_b = "This is a rather long sequence. It is at least longer than sequence A."
+
+>>> encoded_sequence_a = tokenizer(sequence_a)["input_ids"]
+>>> encoded_sequence_b = tokenizer(sequence_b)["input_ids"]
+```
+
+لدى الإصدارات المشفرة أطوال مختلفة:
+
+```python
+>>> len(encoded_sequence_a), len(encoded_sequence_b)
+(8, 19)
+```
+
+لذلك، لا يمكننا وضعها معًا في نفس المصفوفة كما هي. يجب إضافة حشو إلى التسلسل الأول حتى يصل إلى طول التسلسل الثاني، أو يجب تقليص الثاني إلى طول الأول.
+
+في الحالة الأولى، يتم تمديد قائمة المعرفات بواسطة مؤشرات الحشو. يمكننا تمرير قائمة إلى المحلل اللغوي وطلب منه إضافة الحشو بهذه الطريقة:
+
+```python
+>>> padded_sequences = tokenizer([sequence_a, sequence_b], padding=True)
+```
+
+يمكننا أن نرى أنه تمت إضافة اصفار على يمين الجملة الأولى لجعلها بنفس طول الجملة الثانية:
+
+```python
+>>> padded_sequences["input_ids"]
+[[101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]]
+```
+
+يمكن بعد ذلك تحويل هذا إلى مصفوفة في PyTorch أو TensorFlow. قناع الانتباه هو مصفوفة ثنائية تشير إلى
+موضع  المؤشرات المحشوه بحيث لا ينتبه إليها النموذج. بالنسبة إلى [`BertTokenizer`]`1` يشير إلى
+قيمة يجب الانتباه إليها، في حين يشير `0` إلى قيمة مبطنة. يُمكن إيجاد قناع الانتباه في القاموس الذي يُعيده مُجزِّئ النصوص (tokenizer) تحت المفتاح "attention_mask".
+```python
+>>> padded_sequences["attention_mask"]
+[[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
+```
+
+### نماذج الترميز التلقائي (autoencoding models)
+
+راجع [نماذج الترميز](#encoder-models) و [نمذجة اللغة المقنعة](#masked-language-modeling-mlm)
+
+### النماذج ذاتية الانحدار (Autoregressive Models)
+
+راجع [نمذجة اللغة السببية](#causal-language-modeling) و [نماذج فك التشفير](#decoder-models)
+
+## B
+
+### العمود الفقري (backbone)
+
+يُمثل العمود الفقري الشبكة العصبونية (الترميزات والطبقات) المسؤولة عن إخراج الحالات الخفية أو المُميزات الأولية. عادة ما يكون متصلاً بـ [رأس](#head) يستقبل المُميزات كمدخلات لإجراء تنبؤ. على سبيل المثال، يُعد النموذج [`ViTModel`] عمودًا فقريًا دون رأس مُحدد مُرفق به. يمكن أيضًا استخدام `ViTModel` كعمود فقري في نماذج أخرى, مثل [DPT](model_doc/dpt).
+
+## C
+
+### نمذجة اللغة السببية (أو التنبؤية) causal language modeling
+
+مهمة ما قبل التدريب يقوم فيها النموذج بقراءة النصوص بالترتيب ويتنبأ بالكلمة التالية. يتم ذلك عادةً من خلال قراءة الجملة كاملةً، ولكن مع استخدام قناع داخل النموذج لإخفاء الرموز المميزة اللاحقة في خطوة زمنية معينة.
+
+
+
+### قناة(channel)
+
+تتكون الصور الملونة من مزيج من القيم في ثلاث قنوات لونية: الأحمر والأخضر والأزرق (RGB) بينما تحتوي صور ذات التدرج رمادي على قناة واحدة فقط. في مكتبة 🤗 Transformers، يمكن أن تكون القناة اللونية البُعد الأول أو الأخير في مُصفوفة الصورة: [`n_channels`، `height`، `width`] أو [`height`، `width`، `n_channels`].
+
+### التصنيف الزمني التوصيلي connectionist temporal classification (CTC)
+
+خوارزمية تسمح للنموذج بالتعلم دون معرفة كيفية محاذاة المدخلات مع المخرجات بدقة؛ يحسب CTC توزيع جميع المخرجات المحتملة لمدخلات مُحددة ويختار المخرج الأكثر احتمالًا. تُستخدم CTC بشكل شائع في مهام التعرف على الكلام نظرًا لأن الكلام المنطوق لا يتوافق دائمًا بشكل مُباشر مع النص المكتوب، لأسباب مختلفة مثل معدلات الكلام المختلفة للمتكلم.
+
+### الالتفاف (Convolution)
+
+نوع من الطبقات في شبكة عصبية، حيث تُضرب مصفوفة الإدخال عُنصرًا بُعنصر بمصفوفة أصغر تُسمى (النواة أو المرشح) ويتم جمع القيم في مصفوفة جديدة. يُعرف هذا باسم عملية الالتفاف التي يتم تكرارها عبر مصفوفة الإدخال بأكملها. تُطبق كل عملية التفاف على جزء مُختلف من مصفوفة الإدخال. تُستخدم الشبكات العصبية الالتفافية (CNNs) بشكل شائع في رؤية الحاسوب.
+
+## D
+
+### التوازي على مستوى البيانات (DataParallel - DP)
+
+هي تقنية تُستخدم لتدريب النماذج على عدة وحدات معالجة رسومات (GPUs)، حيث يتم نسخ نفس إعداد التدريب عدة مرات، بحيث تتلقى كل نسخة شريحة مختلفة من البيانات يتم تنفيذ المعالجة بالتوازي ويتم مزامنة جميع الإعدادات في نهاية كل خطوة تدريب.
+
+تعرف على المزيد حول كيفية عمل DataParallel [هنا](perf_train_gpu_many#dataparallel-vs-distributeddataparallel).
+
+### معرفات مدخلات وحدة فك التشفير (decoder input IDs)
+
+هذا المدخل خاص بنماذج الترميز وفك التشفير، ويحتوي على معرفات الإدخال التي سيتم تغذيتها إلى وحدة فك التشفير.
+يجب استخدام هذه المدخلات لمهام التسلسل إلى التسلسل، مثل الترجمة أو التلخيص، وعادة ما يتم بناؤها بطريقة محددة لكل نموذج.
+
+تقوم معظم نماذج الترميز وفك التشفير (BART، T5) بإنشاء معرفات `decoder_input_ids` الخاصة بها من `labels`. في مثل هذه النماذج،
+يعد تمرير `labels` هو الطريقة المفضلة للتعامل مع التدريب.
+
+يرجى التحقق من وثائق كل نموذج لمعرفة كيفية تعاملها مع معرفات الإدخال هذه للتدريب على التسلسل إلى التسلسل.
+
+### نماذج فك التشفير (decoder models)
+
+يُشار إليها أيضًا باسم نماذج التنبؤية الذاتية، وتنطوي نماذج فك التشفير على مهمة ما قبل التدريب (تسمى نمذجة اللغة السببية) حيث يقرأ النموذج النصوص بالترتيب ويتعين عليه التنبؤ بالكلمة التالية. يتم ذلك عادةً عن طريق
+قراءة الجملة بأكملها مع قناع لإخفاء الرموز المميزة المستقبلية في خطوة زمنية معينة.
+
+<Youtube id="d_ixlCubqQw"/>
+### التعلم العميق deep learning (DL)
+خوارزميات التعلم الآلي التي تستخدم الشبكات العصبية متعددة الطبقات.
+
+## E
+
+### نماذج الترميز (encoder models)
+
+تُعرف أيضًا باسم نماذج الترميز التلقائي، وتأخذ نماذج الترميز إدخالًا (مثل النص أو الصور) وتحويلها إلى تمثيل رقمي مكثف يُطلق عليه الترميز. غالبًا ما يتم تدريب نماذج الترميز مسبقًا باستخدام تقنيات مثل [نمذجة اللغة المقنعة](#masked-language-modeling-mlm)، والتي تقوم بإخفاء أجزاء من تسلسل الإدخال وإجبار النموذج على إنشاء تمثيلات أكثر دلالة (فائدة ووضوحاً).
+
+<Youtube id="H39Z_720T5s"/>
+
+## F
+### استخراج الميزات (feature extraction)
+
+عملية اختيار وتحويل البيانات الأولية إلى مجموعة من الميزات الأكثر إفادة وفائدة لخوارزميات التعلم الآلي. بعض الأمثلة على استخراج الميزات تشمل تحويل النص الأولي/الخام إلى ترميزات الكلمات واستخراج ميزات مهمة مثل الحواف أو الأشكال من بيانات الصور/الفيديو.
+
+### تجزئة التغذية الأمامية (feed forward chunking)
+
+في كل وحدة الانتباه الباقية في المحولات، تلي طبقة الاهتمام الانتباه عادة طبقتان للتغذية الأمامية.
+حجم تضمين الطبقة الأمامية الوسيطة أكبر عادة من حجم المخفي للنموذج (على سبيل المثال، لـ
+`google-bert/bert-base-uncased`).
+بالنسبة لإدخال بحجم `[batch_size, sequence_length]`، يمكن أن تمثل الذاكرة المطلوبة لتخزين التضمينات الأمامية الوسيطة `[batch_size، sequence_length, config.intermediate_size]` جزءًا كبيرًا من استخدام الذاكرة. لاحظ مؤلفو (https://arxiv.org/abs/2001.04451)[Reformer: The Efficient Transformer] أنه نظرًا لأن الحساب مستقل عن بعد `sequence_length`، فإنه من المكافئ رياضيًا حساب تضمينات الإخراج الأمامية `[batch_size، config.hidden_size]_0, ..., [batch_size، `config_size]_n
+فردياً والتوصيل بها لاحقًا إلى `[batch_size, sequence_length, config.hidden_size]` مع `n = sequence_length`، والذي يتداول زيادة وقت الحساب مقابل تقليل استخدام الذاكرة، ولكنه ينتج عنه نتيجة مكافئة رياضيا.
+
+بالنسبة للنماذج التي تستخدم الدالة `[apply_chunking_to_forward]`، يحدد `chunk_size` عدد التضمينات يتم حساب الإخراج بالتوازي وبالتالي يحدد المقايضة بين حجم الذاكرة والتعقيد الوقت. إذا تم تعيين `chunk_size` إلى `0`، فلن يتم إجراء تجزئة التغذية الأمامية.
+
+
+### النماذج المضبوطة (finetuned models)
+
+الضبط الدقيق هو شكل من أشكال نقل التعلم، يتضمن أخذ نموذج مُدرّب مسبقًا، وتجميد أوزانه، واستبدال طبقة الإخراج برأس نموذج مُضاف حديثًا. يتم تدريب رأس النموذج على مجموعة البيانات المستهدفة.
+
+راجع البرنامج التعليمي [Fine-tune a pretrained model](https://huggingface.co/docs/transformers/training) لمزيد من التفاصيل، وتعرف على كيفية ضبط النماذج باستخدام 🤗 Transformers.
+
+## H
+
+### رأس النموذج (head)
+
+يشير رأس النموذج إلى الطبقة الأخيرة من الشبكة العصبية التي تقبل الحالات المخفية الخام/الأولية وتُسقطها على بُعد مختلف. يوجد رأس نموذج مختلف لكل مهمة.
+
+  * [`GPT2ForSequenceClassification`] هو رأس تصنيف تسلسل - طبقة خطية - أعلى نموذج [`GPT2Model`] الأساسي.
+  * [`ViTForImageClassification`] هو رأس تصنيف صورة - طبقة خطية أعلى حالة مخفية نهائية للرمز `CLS` - أعلى نموذج [`ViTModel`] الأساسي.
+  * [`Wav2Vec2ForCTC`] هو رأس نمذجة اللغة مع [CTC](#connectionist-temporal-classification-ctc) أعلى نموذج [`Wav2Vec2Model`] الأساسي.
+
+## I
+
+### رقعة الصور (image patch)
+
+"رقعة الصورة" في نماذج المحولات البصرية، تُقسم الصورة إلى أجزاء أصغر تسمى "رقعات". يتم تمثيل كل رقعة بشكل رقمي (تحويلها إلى مجموعة من الأرقام) ثم تُعالج كسلسلة من البيانات. يمكنك العثور على حجم الرُقعة patch_size - أو دقتها - في إعدادات النموذج.
+
+### الاستدلال (Inference)
+
+الاستدلال هو عملية تقييم نموذج على بيانات جديدة بعد اكتمال التدريب. راجع البرنامج التعليمي [Pipeline for inference](https://huggingface.co/docs/transformers/pipeline_tutorial) لمعرفة كيفية إجراء الاستدلال باستخدام 🤗 Transformers.
+
+### معرفات الإدخال (input IDs)
+
+معرفات الإدخال هي غالبًا المعلمات المطلوبة الوحيدة التي يجب تمريرها إلى النموذج كإدخال. هذه المعرفات عبارة عن أرقام تمثل كل كلمة أو رمز في الجملة التي نريد أن يفهمها النموذج. بمعنى آخر، هي طريقة لترجمة الكلمات إلى أرقام يتم استخدامها كإدخال بواسطة النموذج.
+
+<Youtube id="VFp38yj8h3A"/>
+
+يعمل كل محلل لغوي بشكل مختلف ولكن الآلية الأساسية تبقى كما هي. إليك مثال باستخدام محلل BERT اللغوي، والذي يعد محلل لغوي [WordPiece](https://arxiv.org/pdf/1609.08144.pdf):
+
+```python
+>>> from transformers import BertTokenizer
+
+>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
+
+>>> sequence = "A Titan RTX has 24GB of VRAM"
+```
+
+يتولى المحلل اللغوي مهمة تقسيم التسلسل إلى رموز مميزة متوفرة في قاموس المحلل اللغوي.
+
+```python
+>>> tokenized_sequence = tokenizer.tokenize(sequence)
+```
+
+االرموز إما كلمات أو أجزاء كلمات. هنا على سبيل المثال، لم تكن كلمة "VRAM" موجودة في مفردات النموذج، لذلك تم تقسيمها إلى "V" و "RA" و "M". للإشارة إلى أن هذه الرموز ليست كلمات منفصلة ولكنها أجزاء من نفس الكلمة، تمت إضافة بادئة مزدوجة (#) إلى "RA" و "M":
+```python
+>>> print(tokenized_sequence)
+['A', 'Titan', 'R', '##T', '##X', 'has', '24', '##GB', 'of', 'V', '##RA', '##M']
+```
+```python
+>>> print(tokenized_sequence)
+['A'، 'Titan'، 'R'، '##T'، '##X'، 'has'، '24'، '##GB'، 'of'، 'V'، '##RA'، '##M']
+```
+
+يمكن بعد ذلك تحويل هذه الرموز إلى مُعرفات يفهمها النموذج. يمكن القيام بذلك عن طريق تغذية الجملة مباشرةً إلى مُجزّئ الرموز، والذي يستفيد من تنفيذ 🤗 Tokenizers بلغة Rust للحصول على أعلى أداء.
+
+```python
+>>> inputs = tokenizer(sequence)
+```
+
+يقوم المحلل اللغوي بإرجاع قاموس يحتوي على جميع المعلومات التي يحتاجها النموذج للعمل بشكل صحيح. وتوجد مؤشرات الرموز المميزة تحت مفتاح `input_ids`:
+
+```python
+>>> encoded_sequence = inputs["input_ids"]
+>>> print(encoded_sequence)
+[101، 138، 18696، 155، 1942، 3190، 1144، 1572، 13745، 1104، 159، 9664، 2107، 102]
+```
+
+لاحظ أن المحلل اللغوي يضيف تلقائيًا "رموزًا خاصة" (إذا كان النموذج المرتبط يعتمد عليها) وهي معرفات خاصة
+يستخدمها النموذج في بعض الأحيان.
+
+إذا قمنا بفك تشفير التسلسل السابق،
+
+```python
+>>> decoded_sequence = tokenizer.decode(encoded_sequence)
+```
+
+سنرى
+
+```python
+>>> print(decoded_sequence)
+[CLS] A Titan RTX has 24GB of VRAM [SEP]
+```
+
+لأن هذه هي الطريقة التي يتوقع بها نموذج [`BertModel`] إدخالاته.
+
+## L
+
+### االملصقات (Labels)
+
+هي معامل اختياري يمكن إدخاله في النموذج لحساب الخسارة بنفسه.
+نماذج تصنيف التسلسل: ([BertForSequenceClassification]) يتوقع النموذج مصفوفة ذات بعد (batch_size) حيث تتوافق كل قيمة من المجموعة مع الملصق المتوقع للتسلسل بأكمله.
+نماذج تصنيف الرمز: ([BertForTokenClassification]) يتوقع النموذج مصفوفة ذات بعد (batch_size, seq_length) حيث تتوافق كل قيمة مع الملصق المتوقع لكل رمز فردي.
+نماذج النمذجة اللغوية المقنعة:([BertForMaskedLM]) يتوقع النموذج مصفوفة ذات بعد (batch_size, seq_length) حيث تتوافق كل قيمة مع الملصق المتوقع لكل رمز فردي: تكون الملصقات هي معرف رمز الكلمة المقنعة، والقيم الأخرى يتم تجاهلها (عادةً -100).
+مهام التسلسل إلى التسلسل: ([BartForConditionalGeneration], [MBartForConditionalGeneration]) يتوقع النموذج مصفوفة ذات بعد (batch_size, tgt_seq_length) حيث تتوافق كل قيمة مع التسلسل الهدف المرتبط بكل تسلسل مدخل. أثناء التدريب، سيقوم كل من BART و T5 بإنشاء decoder_input_ids و decoder attention masks داخليًا. عادةً لا يلزم توفيرها. هذا لا ينطبق على النماذج التي تستخدم إطار العمل Encoder-Decoder.
+نماذج تصنيف الصور: ([ViTForImageClassification]) يتوقع النموذج مصفوفة ذات بعد (batch_size) حيث تتوافق كل قيمة من المجموعة مع الملصق المتوقع لكل صورة فردية.
+نماذج التقسيم الدلالي: ([SegformerForSemanticSegmentation]) يتوقع النموذج مصفوفة ذات بعد (batch_size, height, width) حيث تتوافق كل قيمة من المجموعة مع الملصق المتوقع لكل بكسل فردي.
+نماذج اكتشاف الأجسام: ([DetrForObjectDetection]) يتوقع النموذج قائمة من القواميس تحتوي على مفتاح class_labels و boxes حيث تتوافق كل قيمة من المجموعة مع الملصق المتوقع وعدد المربعات المحيطة بكل صورة فردية.
+نماذج التعرف التلقائي على الكلام: ([Wav2Vec2ForCTC]) يتوقع النموذج مصفوفة ذات بعد (batch_size, target_length) حيث تتوافق كل قيمة مع الملصق المتوقع لكل رمز فردي.
+
+<Tip>
+
+قد تختلف تسميات كل نموذج، لذا تأكد دائمًا من مراجعة وثائق كل نموذج للحصول على معلومات حول التسميات الخاصة به.
+
+</Tip>
+لا تقبل النماذج الأساسية ([`BertModel`]) الملصقات ، لأنها نماذج المحول الأساسية، والتي تقوم ببساطة بإخراج الميزات.
+
+### نماذج اللغة الكبيرة large language models (LLM)
+
+مصطلح عام يشير إلى نماذج اللغة المحولة (GPT-3 و BLOOM و OPT) التي تم تدريبها على كمية كبيرة من البيانات. تميل هذه النماذج أيضًا إلى وجود عدد كبير من المعلمات القابلة للتعلم (على سبيل المثال، 175 مليار لمعلمة GPT-3).
+
+## M
+
+### نمذجة اللغة المقنعة masked language modeling (MLM)
+
+مهمة تدريب مسبق حيث يرى النموذج نسخة تالفة من النصوص، وعادة ما يتم ذلك عن طريق حجب بعض الرموز بشكل عشوائي، ويتعين على النموذج التنبؤ بالنص الأصلي.
+
+### متعدد الوسائط (multimodal)
+
+مهمة تجمع بين النصوص مع نوع آخر من المدخلات (على سبيل المثال، الصور).
+
+## N
+
+### توليد اللغة الطبيعية Natural language generation (NLG)
+
+جميع المهام المتعلقة بتوليد النص (على سبيل المثال، [اكتب باستخدام المحولات](https://transformer.huggingface.co/)، والترجمة).
+
+### معالجة اللغة الطبيعية Natural language processing (NLP)
+
+طريقة عامة للقول "التعامل مع النصوص".
+
+### فهم اللغة الطبيعية Natural language understanding (NLU)
+
+جميع المهام المتعلقة بفهم ما هو موجود في نص (على سبيل المثال تصنيف النص بأكمله، أو الكلمات الفردية).
+
+## P
+
+### خط الأنابيب (pipeline)
+
+في مكتبة Transformers، يُشير مصطلح "خط الأنابيب" إلى سلسلة من الخطوات التي يتم تنفيذها بترتيب محدد لمعالجة البيانات وتحويلها وإرجاع تنبؤ من نموذج. بعض المراحل الشائعة في خط الأنابيب قد تشمل معالجة البيانات الأولية، واستخراج الميزات، والتوحيد.
+
+للحصول على مزيد من التفاصيل، راجع [خطوط الأنابيب للاستدلال](https://huggingface.co/docs/transformers/pipeline_tutorial).
+
+### التوازي على مستوى خط الأنابيب (PipelineParallel)
+
+تقنية توازي يتم فيها تقسيم النموذج رأسياً (على مستوى الطبقة) عبر وحدات معالجة الرسومات (GPU) متعددة، بحيث توجد طبقة واحدة أو عدة طبقات من النموذج على وحدة معالجة الرسومات (GPU) واحدة فقط. تقوم كل وحدة معالجة رسومات (GPU) بمعالجة مراحل مختلفة من خط الأنابيب بالتوازي والعمل على جزء صغير من الدفعة. تعرف على المزيد حول كيفية عمل PipelineParallel [هنا](perf_train_gpu_many#from-naive-model-parallelism-to-pipeline-parallelism).
+
+### قيم البكسل (pixel values)
+
+مصفوفة من التمثيلات الرقمية لصورة يتم تمريرها إلى نموذج. تأخذ قيم البكسل شكل [`batch_size`، `num_channels`، `height`، `width`]، ويتم إنشاؤها من معالج الصور.
+
+### التجميع (Pooling)
+
+هي عملية تقوم بتقليص مصفوفة إلى مصفوفة أصغر، إما عن طريق أخذ القيمة القصوى أو المتوسط الحسابي للأبعاد التي يتم تجميعها. توجد طبقات التجميع بشكل شائع بين الطبقات التلافيفية convolutional layers لتقليل حجم تمثيل الميزات.
+
+### معرفات الموضع (position IDs)
+
+على عكس الشبكات العصبية المتكررة (RNNs) التي تتضمن موضع كل رمز (token) ضمن بنيتها، لا تدرك المحولات موضع كل رمز. لذلك، تستخدم معرفات الموضع (`position_ids`) من قبل النموذج لتحديد موضع كل رمز في قائمة الرموز.
+
+إنها معلمة اختيارية. إذا لم يتم تمرير أي `position_ids` إلى النموذج، يتم إنشاء المعرفات تلقائيًا كترميزات موضعية مطلقة.
+
+يتم اختيار الترميزات الموضعية المطلقة في النطاق `[0، config.max_position_embeddings - 1]`. تستخدم بعض النماذج أنواعًا أخرى من الترميزات الموضعية، مثل الترميزات الموضعية الجيبية أو الترميزات الموضعية النسبية.
+
+### ما قبل المعالجة (preprocessing) 
+
+مهمة إعداد البيانات الخام بتنسيق يمكن أن تستهلكه نماذج التعلم الآلي بسهولة. على سبيل المثال، عادةً ما تتم معالجة النص مسبقًا عن طريق التمييز. للحصول على فكرة أفضل عن كيفية ظهور المعالجة المسبقة لأنواع الإدخال الأخرى، راجع البرنامج التعليمي [Preprocess](https://huggingface.co/docs/transformers/preprocessing).
+
+### النموذج المسبق التدريب (pretrained model)
+
+نموذج تم تدريبه مسبقًا على بعض البيانات (على سبيل المثال، كل Wikipedia). تنطوي طرق التدريب المسبق على هدف ذاتي الإشراف، والذي يمكن أن يكون قراءة النص ومحاولة التنبؤ بالكلمة التالية ( راجع (causal-language-modeling#)[نمذجة اللغة السببية] ) أو قناع بعض الكلمات ومحاولة التنبؤ بها ( راجع (masked-language#)[نمذجة اللغة المقنعة]- عرض MLM).
+
+لدى نماذج الكلام والرؤية أهدافها التدريبية المسبقة الخاصة. على سبيل المثال، Wav2Vec2 هو نموذج كلام تم تدريبه مسبقًا على مهمة تباينية تتطلب من النموذج تحديد تمثيل الكلام "الحقيقي" من مجموعة من تمثيلات الكلام "الخاطئة". من ناحية أخرى، BEiT هو نموذج رؤية تم تدريبه مسبقًا على مهمة نمذجة صورة مقنعة تقوم بقناع بعض رقع الصورة وتتطلب من النموذج التنبؤ بالرقع المقنعة (مشابهة لهدف نمذجة اللغة المقيدة).
+
+## R
+
+### شبكة عصبية متكررة (RNN)
+
+هي نوع من النماذج التي تستخدم حلقة متكررة فوق طبقة معينة لمعالجة النصوص.
+
+### التعلم التمثيلي (representation learning)
+
+هو فرع من فروع تعلم الآلة يركز على تعلم تمثيلات ذات معنى للبيانات الخام. بعض الأمثلة على تقنيات التعلم التمثيلي تشمل تضمين الكلمات، والمشفرات ذاتية، وشبكات التنافس التوليدية(GANs).
+
+## S
+
+### معدل العينات (sampling rate)
+
+قياس، بالهرتز، لعدد العينات (إشارة الصوت) المأخوذة في الثانية. ينتج معدل العينات عن تمييز إشارة مستمرة مثل الكلام.
+
+### الانتباه الذاتي (Self-Attention)
+
+هو آلية تتيح لكل عنصر في المدخل أن يحدد أي العناصر الأخرى في نفس المدخل يجب أن ينتبه إليها.
+
+### التعلم الذاتي الخاضع للإشراف (supervised learning)
+
+فئة من تقنيات التعلم الآلي التي يقوم فيها النموذج بإنشاء هدفه التعليمي الخاص من البيانات غير الموسومة. يختلف عن [التعلم غير الخاضع للإشراف](#unsupervised-learning) و [التعلم الخاضع للإشراف](#supervised-learning) في أن عملية التعلم خاضعة للإشراف، ولكن ليس صراحة من المستخدم.
+
+مثال واحد على التعلم الذاتي الخاضع للإشراف هو [نمذجة اللغة المقيدة](#masked-language- عرض MLM)، حيث يتم تمرير جمل للنموذج مع إزالة نسبة من رموزه ويتعلم التنبؤ بالرموز المفقودة.
+
+### التعلم شبه الخاضع للإشراف (semi-supervised learning)
+
+فئة واسعة من تقنيات تدريب التعلم الآلي التي تستفيد من كمية صغيرة من البيانات الموسومة مع كمية أكبر من البيانات غير الموسومة لتحسين دقة النموذج، على عكس [التعلم الخاضع للإشراف](#supervised-learning) و [التعلم غير الخاضع للإشراف](#unsupervised-learning).
+
+مثال على نهج التعلم شبه الخاضع للإشراف هو "التدريب الذاتي"، حيث يتم تدريب نموذج على بيانات موسومة، ثم يستخدم لتقديم تنبؤات حول البيانات غير الموسومة. يتم إضافة الجزء من البيانات غير الموسومة التي يتنبأ بها النموذج بأكبر قدر من الثقة إلى مجموعة البيانات الموسومة ويتم استخدامها لإعادة تدريب النموذج.
+
+### تسلسل إلى تسلسل (seq2seq)
+
+نماذج تولد تسلسلًا جديدًا من إدخال، مثل نماذج الترجمة، أو نماذج التلخيص (مثل [Bart](model_doc/bart) أو [T5](model_doc/t5)).
+
+### Sharded DDP
+
+اسم آخر لمفهوم [Zero Redundancy Optimizer](#zero-redundancy-optimizer-zero) الأساسي كما هو مستخدم من قبل العديد من التطبيقات الأخرى لـ Zero.
+
+### الخطوة (Stride)
+
+في العمليات التلافيفية أو التجميعية، تشير الخطوة إلى المسافة التي يتحرك بها النواة (kernel) فوق المصفوفة. خطوة تساوي 1 تعني أن النواة تتحرك بكسل واحد في كل مرة.
+
+### التعلم الخاضع للإشراف (supervised learning)
+
+هو نوع من تدريب النماذج التي تستخدم بيانات مُعلَّمة بشكل مباشر لتصحيح أداء النموذج وتوجيهه. يتم تغذية البيانات إلى النموذج قيد التدريب، ويتم مقارنة تنبؤاته بالنتائج الصحيحة المعروفة. يقوم النموذج بتعديل أوزانه بناءً على مدى خطأ تنبؤاته، وتتكرر هذه العملية لتحسين أداء النموذج.
+
+## T
+
+### توازي Tensor (TP)
+
+تقنية توازي لتدريب وحدات معالجة الرسومات (GPU) متعددة يتم فيها تقسيم المصفوفة إلى عدة أجزاء، لذا بدلاً من وجود المصفوفة بأكملها على وحدة معالجة الرسومات (GPU) واحدة، توجد كل شظية من المصفوفة على وحدة معالجة الرسومات (GPU) المخصصة لها. تتم معالجة الشظايا بشكل منفصل وبالتوازي على وحدات معالجة الرسومات (GPU) المختلفة ويتم مزامنة النتائج في نهاية خطوة المعالجة. هذا ما يُطلق عليه أحيانًا التوازي الأفقي، حيث يحدث الانقسام على المستوى الأفقي.
+
+تعرف على المزيد حول توازي Tensor [هنا](perf_train_gpu_many#tensor-parallelism).
+
+### الرمز اللغوي (Token)
+
+جزء من جملة، عادة ما يكون كلمة، ولكن يمكن أن يكون أيضًا كلمة فرعية (غالبًا ما يتم تقسيم الكلمات غير الشائعة إلى كلمات فرعية) أو علامة ترقيم.
+
+### معرفات نوع الرمز (token type ids)
+
+الغرض من بعض النماذج هو إجراء التصنيف على أزواج من الجمل أو الإجابة على الأسئلة.
+
+<Youtube id="0u3ioSwev3s"/>
+
+يتطلب ذلك تسلسلين مختلفين يتم دمجهما في إدخال "input_ids" واحد، والذي يتم عادةً باستخدام رموز خاصة، مثل رموز التصنيف (`[CLS]`) والفاصل (`[SEP]`). على سبيل المثال، يقوم نموذج BERT ببناء إدخال تسلسلين على النحو التالي:
+
+```python
+>>> # [CLS] SEQUENCE_A [SEP] SEQUENCE_B [SEP]
+```
+
+يمكننا استخدام برنامجنا للتمييز لإنشاء مثل هذه الجملة تلقائيًا عن طريق تمرير التسلسلين إلى `tokenizer` كمعامليين (وليس قائمة، كما كان من قبل) مثل هذا:
+
+```python
+>>> from transformers import BertTokenizer
+
+>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
+>>> sequence_a = "HuggingFace is based in NYC"
+>>> sequence_b = "Where is HuggingFace based?"
+
+>>> encoded_dict = tokenizer(sequence_a، sequence_b)
+>>> decoded = tokenizer.decode(encoded_dict["input_ids"])
+```
+
+والذي سيعيد:
+
+```python
+>>> print(decoded)
+[CLS] HuggingFace is based in NYC [SEP] Where is HuggingFace based؟ [SEP]
+```
+
+هذا يكفي لبعض النماذج لفهم أين ينتهي تسلسل واحد وأين يبدأ الآخر. ومع ذلك، تستخدم نماذج أخرى، مثل BERT، أيضًا معرفات نوع الرمز (يُطلق عليها أيضًا معرفات الجزء). يتم تمثيلها كماسك ثنائي لتحديد نوعي التسلسل في النموذج.
+
+يعيد برنامج الترميز هذا القناع كإدخال "token_type_ids":
+
+```python
+>>> encoded_dict["token_type_ids"]
+[0، 0، 0، 0، 0، 0، 0، 0، 0، 0، 1، 1، 1، 1، 1، 1، 1، 1، 1]
+```
+
+يتم تمثيل التسلسل الأول، "السياق" المستخدم للسؤال، بجميع رموزه بواسطة `0`، في حين يتم تمثيل التسلسل الثاني، المقابل إلى "السؤال"، بجميع رموزه بواسطة `1`.
+
+تستخدم بعض النماذج، مثل [`XLNetModel`] رمزًا إضافيًا يمثله `2`.
+
+### التعلم الانتقالي (Transfer Learning)
+
+تقنية تنطوي على أخذ نموذج تم تدريبه مسبقًا وتكييفه مع مجموعة بيانات خاصة بمهمتك. بدلاً من تدريب نموذج من الصفر، يمكنك الاستفادة من المعرفة المكتسبة من نموذج موجود كنقطة بداية. يسرع هذا عملية التعلم ويقلل من كمية بيانات التدريب المطلوبة.
+
+### المحول (Transformer)
+
+هو بنية لنموذج تعلم عميق يعتمد على الانتباه الذاتي.
+
+## U
+
+### التعلم غير الخاضع للإشراف (unsupervised learning)
+
+شكل من أشكال تدريب النماذج حيث لا يتم وضع علامات على البيانات المقدمة إلى النموذج. تستفيد تقنيات التعلم غير الخاضعة للإشراف من المعلومات الإحصائية لتوزيع البيانات للعثور على الأنماط المفيدة للمهمة المعنية.
+
+## Z
+
+### محسن التكرار الصفري (ZeRO)
+
+تقنية توازي تقوم بتشظية المصفوفات بطريقة مشابهة لـ [TensorParallel](#tensor-parallelism-tp)، باستثناء إعادة بناء المصفوفة بالكامل في الوقت المناسب لحساب التقدير أو الحساب الخلفي، وبالتالي لا يلزم تعديل النموذج. تدعم هذه الطريقة أيضًا تقنيات الإخلاء المختلفة للتعويض عن ذاكرة GPU المحدودة.
+
+تعرف على المزيد حول Zero [هنا](perf_train_gpu_many#zero-data-parallelism).
diff --git a/docs/source/ar/index.md b/docs/source/ar/index.md
new file mode 100644
index 000000000000..c37dbd1c6d9f
--- /dev/null
+++ b/docs/source/ar/index.md
@@ -0,0 +1,342 @@
+# 🤗 Transformers: لمحة عامة
+
+أحدث ما في مجال التعلم الآلي لـ [PyTorch](https://pytorch.org/) و [TensorFlow](https://www.tensorflow.org/) و [JAX](https://jax.readthedocs.io/en/latest/)
+
+توفر 🤗 Transformers واجهات برمجة التطبيقات (APIs) والأدوات اللازمة لتنزيل وتدريب أحدث النماذج المسبقة التدريب بسهولة. ويمكن أن يقلل استخدام النماذج المسبقة التدريب من تكاليف الحوسبة والحد من الأثر البيئي، وتوفّر الوقت والموارد اللازمين لتدريب نموذج من الصفر. وتدعم هذه النماذج المهام الشائعة في مجالات مختلفة، مثل:
+
+
+📝 **معالجة اللغات الطبيعية**: تصنيف النصوص، وتعريف الكيانات المسماة، والإجابة على الأسئلة، ونمذجة اللغة، والتلخيص، والترجمة، والاختيار من متعدد، وتوليد النصوص. <br>
+🖼️ **الرؤية الحاسوبية**: تصنيف الصور، وكشف الأشياء، وتجزئتها. <br>
+🗣️ **الصوت**: التعرف التلقائي على الكلام، وتصنيف الصوت. <br>
+🐙 **متعدد الوسائط**: الإجابة على الأسئلة الجدولية، والتعرف البصري على الحروف، واستخراج المعلومات من المستندات الممسوحة ضوئيًا، وتصنيف الفيديو، والإجابة على الأسئلة البصرية.
+
+تدعم 🤗 Transformers التوافق بين أطر العمل المختلفة مثل PyTorch و TensorFlow و JAX. ويوفر ذلك المرونة لاستخدام إطار عمل مختلف في كل مرحلة من مراحل حياة النموذج؛ قم بتدريب نموذج في ثلاث خطوط من التعليمات البرمجية في إطار واحد، وقم بتحميله للاستدلال في إطار آخر. ويمكن أيضًا تصدير النماذج إلى صيغ مثل ONNX و TorchScript للنشر في بيئات الإنتاج.
+
+انضم إلى المجتمع المتنامي على [Hub](https://huggingface.co/models) أو [المنتدى](https://discuss.huggingface.co/) أو [Discord](https://discord.com/invite/JfAtkvEtRb) اليوم!
+
+## إذا كنت تبحث عن دعم مخصص من فريق Hugging Face
+
+<a target="_blank" href="https://huggingface.co/support">
+    <img alt="HuggingFace Expert Acceleration Program" src="https://cdn-media.huggingface.co/marketing/transformers/new-support-improved.png" style="width: 100%; max-width: 600px; border: 1px solid #eee; border-radius: 4px; box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);">
+</a>
+
+## المحتويات
+
+ينقسم التوثيق إلى خمسة أقسام:
+
+- **ابدأ** تقدم جولة سريعة في المكتبة وتعليمات التثبيت للبدء.
+- **الدروس التعليمية** هي مكان رائع للبدء إذا كنت مبتدئًا. سيساعدك هذا القسم على اكتساب المهارات الأساسية التي تحتاجها للبدء في استخدام المكتبة.
+- **أدلة كيفية الاستخدام** تُظهر لك كيفية تحقيق هدف محدد، مثل ضبط نموذج مسبق التدريب لنمذجة اللغة أو كيفية كتابة ومشاركة نموذج مخصص.
+- **الأدلة المفاهيمية** تقدم مناقشة وتفسيرًا أكثر للأفكار والمفاهيم الأساسية وراء النماذج والمهام وفلسفة التصميم في 🤗 Transformers.
+- **واجهة برمجة التطبيقات (API)** تصف جميع الفئات والوظائف:
+
+  - **الفئات الرئيسية** تشرح الفئات الأكثر أهمية مثل التكوين والنمذجة والتحليل النصي وخط الأنابيب.
+  - **النماذج** تشرح الفئات والوظائف المتعلقة بكل نموذج يتم تنفيذه في المكتبة.
+  - **المساعدون الداخليون** يشرحون فئات ووظائف المساعدة التي يتم استخدامها داخليًا.
+
+
+## النماذج والأطر المدعومة
+
+يمثل الجدول أدناه الدعم الحالي في المكتبة لكل من هذه النماذج، وما إذا كان لديها محلل نحوي Python (يُسمى "بطيء"). محلل نحوي "سريع" مدعوم بمكتبة 🤗 Tokenizers، وما إذا كان لديها دعم في Jax (عبر Flax) و/أو PyTorch و/أو TensorFlow.
+
+<!--يتم تحديث هذا الجدول تلقائيًا من الوحدات النمطية التلقائية مع _make fix-copies_. لا تقم بالتحديث يدويًا!-->
+<!--This table is updated automatically from the auto modules with _make fix-copies_. Do not update manually!-->
+
+|                                  Model                                   | PyTorch support | TensorFlow support | Flax Support |
+|:------------------------------------------------------------------------:|:---------------:|:------------------:|:------------:|
+|                        [ALBERT](model_doc/albert)                        |       ✅        |         ✅         |      ✅      |
+|                         [ALIGN](model_doc/align)                         |       ✅        |         ❌         |      ❌      |
+|                       [AltCLIP](model_doc/altclip)                       |       ✅        |         ❌         |      ❌      |
+| [Audio Spectrogram Transformer](model_doc/audio-spectrogram-transformer) |       ✅        |         ❌         |      ❌      |
+|                    [Autoformer](model_doc/autoformer)                    |       ✅        |         ❌         |      ❌      |
+|                          [Bark](model_doc/bark)                          |       ✅        |         ❌         |      ❌      |
+|                          [BART](model_doc/bart)                          |       ✅        |         ✅         |      ✅      |
+|                       [BARThez](model_doc/barthez)                       |       ✅        |         ✅         |      ✅      |
+|                       [BARTpho](model_doc/bartpho)                       |       ✅        |         ✅         |      ✅      |
+|                          [BEiT](model_doc/beit)                          |       ✅        |         ❌         |      ✅      |
+|                          [BERT](model_doc/bert)                          |       ✅        |         ✅         |      ✅      |
+|               [Bert Generation](model_doc/bert-generation)               |       ✅        |         ❌         |      ❌      |
+|                 [BertJapanese](model_doc/bert-japanese)                  |       ✅        |         ✅         |      ✅      |
+|                      [BERTweet](model_doc/bertweet)                      |       ✅        |         ✅         |      ✅      |
+|                      [BigBird](model_doc/big_bird)                       |       ✅        |         ❌         |      ✅      |
+|               [BigBird-Pegasus](model_doc/bigbird_pegasus)               |       ✅        |         ❌         |      ❌      |
+|                        [BioGpt](model_doc/biogpt)                        |       ✅        |         ❌         |      ❌      |
+|                           [BiT](model_doc/bit)                           |       ✅        |         ❌         |      ❌      |
+|                    [Blenderbot](model_doc/blenderbot)                    |       ✅        |         ✅         |      ✅      |
+|              [BlenderbotSmall](model_doc/blenderbot-small)               |       ✅        |         ✅         |      ✅      |
+|                          [BLIP](model_doc/blip)                          |       ✅        |         ✅         |      ❌      |
+|                        [BLIP-2](model_doc/blip-2)                        |       ✅        |         ❌         |      ❌      |
+|                         [BLOOM](model_doc/bloom)                         |       ✅        |         ❌         |      ✅      |
+|                          [BORT](model_doc/bort)                          |       ✅        |         ✅         |      ✅      |
+|                   [BridgeTower](model_doc/bridgetower)                   |       ✅        |         ❌         |      ❌      |
+|                          [BROS](model_doc/bros)                          |       ✅        |         ❌         |      ❌      |
+|                          [ByT5](model_doc/byt5)                          |       ✅        |         ✅         |      ✅      |
+|                     [CamemBERT](model_doc/camembert)                     |       ✅        |         ✅         |      ❌      |
+|                        [CANINE](model_doc/canine)                        |       ✅        |         ❌         |      ❌      |
+|                     [Chameleon](model_doc/chameleon)                     |       ✅        |         ❌         |      ❌      |
+|                  [Chinese-CLIP](model_doc/chinese_clip)                  |       ✅        |         ❌         |      ❌      |
+|                          [CLAP](model_doc/clap)                          |       ✅        |         ❌         |      ❌      |
+|                          [CLIP](model_doc/clip)                          |       ✅        |         ✅         |      ✅      |
+|                       [CLIPSeg](model_doc/clipseg)                       |       ✅        |         ❌         |      ❌      |
+|                          [CLVP](model_doc/clvp)                          |       ✅        |         ❌         |      ❌      |
+|                       [CodeGen](model_doc/codegen)                       |       ✅        |         ❌         |      ❌      |
+|                    [CodeLlama](model_doc/code_llama)                     |       ✅        |         ❌         |      ✅      |
+|                        [Cohere](model_doc/cohere)                        |       ✅        |         ❌         |      ❌      |
+|              [Conditional DETR](model_doc/conditional_detr)              |       ✅        |         ❌         |      ❌      |
+|                      [ConvBERT](model_doc/convbert)                      |       ✅        |         ✅         |      ❌      |
+|                      [ConvNeXT](model_doc/convnext)                      |       ✅        |         ✅         |      ❌      |
+|                    [ConvNeXTV2](model_doc/convnextv2)                    |       ✅        |         ✅         |      ❌      |
+|                           [CPM](model_doc/cpm)                           |       ✅        |         ✅         |      ✅      |
+|                       [CPM-Ant](model_doc/cpmant)                        |       ✅        |         ❌         |      ❌      |
+|                          [CTRL](model_doc/ctrl)                          |       ✅        |         ✅         |      ❌      |
+|                           [CvT](model_doc/cvt)                           |       ✅        |         ✅         |      ❌      |
+|                           [DAC](model_doc/dac)                           |       ✅        |         ❌         |      ❌      |
+|                   [Data2VecAudio](model_doc/data2vec)                    |       ✅        |         ❌         |      ❌      |
+|                    [Data2VecText](model_doc/data2vec)                    |       ✅        |         ❌         |      ❌      |
+|                   [Data2VecVision](model_doc/data2vec)                   |       ✅        |         ✅         |      ❌      |
+|                          [DBRX](model_doc/dbrx)                          |       ✅        |         ❌         |      ❌      |
+|                       [DeBERTa](model_doc/deberta)                       |       ✅        |         ✅         |      ❌      |
+|                    [DeBERTa-v2](model_doc/deberta-v2)                    |       ✅        |         ✅         |      ❌      |
+|          [Decision Transformer](model_doc/decision_transformer)          |       ✅        |         ❌         |      ❌      |
+|               [Deformable DETR](model_doc/deformable_detr)               |       ✅        |         ❌         |      ❌      |
+|                          [DeiT](model_doc/deit)                          |       ✅        |         ✅         |      ❌      |
+|                        [DePlot](model_doc/deplot)                        |       ✅        |         ❌         |      ❌      |
+|                [Depth Anything](model_doc/depth_anything)                |       ✅        |         ❌         |      ❌      |
+|                          [DETA](model_doc/deta)                          |       ✅        |         ❌         |      ❌      |
+|                          [DETR](model_doc/detr)                          |       ✅        |         ❌         |      ❌      |
+|                      [DialoGPT](model_doc/dialogpt)                      |       ✅        |         ✅         |      ✅      |
+|                         [DiNAT](model_doc/dinat)                         |       ✅        |         ❌         |      ❌      |
+|                        [DINOv2](model_doc/dinov2)                        |       ✅        |         ❌         |      ✅      |
+|                    [DistilBERT](model_doc/distilbert)                    |       ✅        |         ✅         |      ✅      |
+|                           [DiT](model_doc/dit)                           |       ✅        |         ❌         |      ✅      |
+|                       [DonutSwin](model_doc/donut)                       |       ✅        |         ❌         |      ❌      |
+|                           [DPR](model_doc/dpr)                           |       ✅        |         ✅         |      ❌      |
+|                           [DPT](model_doc/dpt)                           |       ✅        |         ❌         |      ❌      |
+|               [EfficientFormer](model_doc/efficientformer)               |       ✅        |         ✅         |      ❌      |
+|                  [EfficientNet](model_doc/efficientnet)                  |       ✅        |         ❌         |      ❌      |
+|                       [ELECTRA](model_doc/electra)                       |       ✅        |         ✅         |      ✅      |
+|                       [EnCodec](model_doc/encodec)                       |       ✅        |         ❌         |      ❌      |
+|               [Encoder decoder](model_doc/encoder-decoder)               |       ✅        |         ✅         |      ✅      |
+|                         [ERNIE](model_doc/ernie)                         |       ✅        |         ❌         |      ❌      |
+|                       [ErnieM](model_doc/ernie_m)                        |       ✅        |         ❌         |      ❌      |
+|                           [ESM](model_doc/esm)                           |       ✅        |         ✅         |      ❌      |
+|              [FairSeq Machine-Translation](model_doc/fsmt)               |       ✅        |         ❌         |      ❌      |
+|                        [Falcon](model_doc/falcon)                        |       ✅        |         ❌         |      ❌      |
+|                  [FalconMamba](model_doc/falcon_mamba)                   |       ✅        |         ❌         |      ❌      |
+|         [FastSpeech2Conformer](model_doc/fastspeech2_conformer)          |       ✅        |         ❌         |      ❌      |
+|                       [FLAN-T5](model_doc/flan-t5)                       |       ✅        |         ✅         |      ✅      |
+|                      [FLAN-UL2](model_doc/flan-ul2)                      |       ✅        |         ✅         |      ✅      |
+|                      [FlauBERT](model_doc/flaubert)                      |       ✅        |         ✅         |      ❌      |
+|                         [FLAVA](model_doc/flava)                         |       ✅        |         ❌         |      ❌      |
+|                          [FNet](model_doc/fnet)                          |       ✅        |         ❌         |      ❌      |
+|                      [FocalNet](model_doc/focalnet)                      |       ✅        |         ❌         |      ❌      |
+|                  [Funnel Transformer](model_doc/funnel)                  |       ✅        |         ✅         |      ❌      |
+|                          [Fuyu](model_doc/fuyu)                          |       ✅        |         ❌         |      ❌      |
+|                         [Gemma](model_doc/gemma)                         |       ✅        |         ❌         |      ✅      |
+|                        [Gemma2](model_doc/gemma2)                        |       ✅        |         ❌         |      ❌      |
+|                           [GIT](model_doc/git)                           |       ✅        |         ❌         |      ❌      |
+|                          [GLPN](model_doc/glpn)                          |       ✅        |         ❌         |      ❌      |
+|                       [GPT Neo](model_doc/gpt_neo)                       |       ✅        |         ❌         |      ✅      |
+|                      [GPT NeoX](model_doc/gpt_neox)                      |       ✅        |         ❌         |      ❌      |
+|             [GPT NeoX Japanese](model_doc/gpt_neox_japanese)             |       ✅        |         ❌         |      ❌      |
+|                         [GPT-J](model_doc/gptj)                          |       ✅        |         ✅         |      ✅      |
+|                       [GPT-Sw3](model_doc/gpt-sw3)                       |       ✅        |         ✅         |      ✅      |
+|                   [GPTBigCode](model_doc/gpt_bigcode)                    |       ✅        |         ❌         |      ❌      |
+|               [GPTSAN-japanese](model_doc/gptsan-japanese)               |       ✅        |         ❌         |      ❌      |
+|                       [Granite](model_doc/granite)                       |       ✅        |         ❌         |      ❌      |
+|                    [Graphormer](model_doc/graphormer)                    |       ✅        |         ❌         |      ❌      |
+|                [Grounding DINO](model_doc/grounding-dino)                |       ✅        |         ❌         |      ❌      |
+|                      [GroupViT](model_doc/groupvit)                      |       ✅        |         ✅         |      ❌      |
+|                       [HerBERT](model_doc/herbert)                       |       ✅        |         ✅         |      ✅      |
+|                         [Hiera](model_doc/hiera)                         |       ✅        |         ❌         |      ❌      |
+|                        [Hubert](model_doc/hubert)                        |       ✅        |         ✅         |      ❌      |
+|                        [I-BERT](model_doc/ibert)                         |       ✅        |         ❌         |      ❌      |
+|                       [IDEFICS](model_doc/idefics)                       |       ✅        |         ✅         |      ❌      |
+|                      [Idefics2](model_doc/idefics2)                      |       ✅        |         ❌         |      ❌      |
+|                      [ImageGPT](model_doc/imagegpt)                      |       ✅        |         ❌         |      ❌      |
+|                      [Informer](model_doc/informer)                      |       ✅        |         ❌         |      ❌      |
+|                  [InstructBLIP](model_doc/instructblip)                  |       ✅        |         ❌         |      ❌      |
+|             [InstructBlipVideo](model_doc/instructblipvideo)             |       ✅        |         ❌         |      ❌      |
+|                         [Jamba](model_doc/jamba)                         |       ✅        |         ❌         |      ❌      |
+|                        [JetMoe](model_doc/jetmoe)                        |       ✅        |         ❌         |      ❌      |
+|                       [Jukebox](model_doc/jukebox)                       |       ✅        |         ❌         |      ❌      |
+|                      [KOSMOS-2](model_doc/kosmos-2)                      |       ✅        |         ❌         |      ❌      |
+|                      [LayoutLM](model_doc/layoutlm)                      |       ✅        |         ✅         |      ❌      |
+|                    [LayoutLMv2](model_doc/layoutlmv2)                    |       ✅        |         ❌         |      ❌      |
+|                    [LayoutLMv3](model_doc/layoutlmv3)                    |       ✅        |         ✅         |      ❌      |
+|                     [LayoutXLM](model_doc/layoutxlm)                     |       ✅        |         ❌         |      ❌      |
+|                           [LED](model_doc/led)                           |       ✅        |         ✅         |      ❌      |
+|                         [LeViT](model_doc/levit)                         |       ✅        |         ❌         |      ❌      |
+|                          [LiLT](model_doc/lilt)                          |       ✅        |         ❌         |      ❌      |
+|                         [LLaMA](model_doc/llama)                         |       ✅        |         ❌         |      ✅      |
+|                        [Llama2](model_doc/llama2)                        |       ✅        |         ❌         |      ✅      |
+|                        [Llama3](model_doc/llama3)                        |       ✅        |         ❌         |      ✅      |
+|                         [LLaVa](model_doc/llava)                         |       ✅        |         ❌         |      ❌      |
+|                    [LLaVA-NeXT](model_doc/llava_next)                    |       ✅        |         ❌         |      ❌      |
+|              [LLaVa-NeXT-Video](model_doc/llava_next_video)              |       ✅        |         ❌         |      ❌      |
+|                    [Longformer](model_doc/longformer)                    |       ✅        |         ✅         |      ❌      |
+|                        [LongT5](model_doc/longt5)                        |       ✅        |         ❌         |      ✅      |
+|                          [LUKE](model_doc/luke)                          |       ✅        |         ❌         |      ❌      |
+|                        [LXMERT](model_doc/lxmert)                        |       ✅        |         ✅         |      ❌      |
+|                        [M-CTC-T](model_doc/mctct)                        |       ✅        |         ❌         |      ❌      |
+|                       [M2M100](model_doc/m2m_100)                        |       ✅        |         ❌         |      ❌      |
+|                    [MADLAD-400](model_doc/madlad-400)                    |       ✅        |         ✅         |      ✅      |
+|                         [Mamba](model_doc/mamba)                         |       ✅        |         ❌         |      ❌      |
+|                        [mamba2](model_doc/mamba2)                        |       ✅        |         ❌         |      ❌      |
+|                        [Marian](model_doc/marian)                        |       ✅        |         ✅         |      ✅      |
+|                      [MarkupLM](model_doc/markuplm)                      |       ✅        |         ❌         |      ❌      |
+|                   [Mask2Former](model_doc/mask2former)                   |       ✅        |         ❌         |      ❌      |
+|                    [MaskFormer](model_doc/maskformer)                    |       ✅        |         ❌         |      ❌      |
+|                        [MatCha](model_doc/matcha)                        |       ✅        |         ❌         |      ❌      |
+|                         [mBART](model_doc/mbart)                         |       ✅        |         ✅         |      ✅      |
+|                      [mBART-50](model_doc/mbart50)                       |       ✅        |         ✅         |      ✅      |
+|                          [MEGA](model_doc/mega)                          |       ✅        |         ❌         |      ❌      |
+|                 [Megatron-BERT](model_doc/megatron-bert)                 |       ✅        |         ❌         |      ❌      |
+|                 [Megatron-GPT2](model_doc/megatron_gpt2)                 |       ✅        |         ✅         |      ✅      |
+|                       [MGP-STR](model_doc/mgp-str)                       |       ✅        |         ❌         |      ❌      |
+|                       [Mistral](model_doc/mistral)                       |       ✅        |         ✅         |      ✅      |
+|                       [Mixtral](model_doc/mixtral)                       |       ✅        |         ❌         |      ❌      |
+|                         [mLUKE](model_doc/mluke)                         |       ✅        |         ❌         |      ❌      |
+|                           [MMS](model_doc/mms)                           |       ✅        |         ✅         |      ✅      |
+|                    [MobileBERT](model_doc/mobilebert)                    |       ✅        |         ✅         |      ❌      |
+|                  [MobileNetV1](model_doc/mobilenet_v1)                   |       ✅        |         ❌         |      ❌      |
+|                  [MobileNetV2](model_doc/mobilenet_v2)                   |       ✅        |         ❌         |      ❌      |
+|                     [MobileViT](model_doc/mobilevit)                     |       ✅        |         ✅         |      ❌      |
+|                   [MobileViTV2](model_doc/mobilevitv2)                   |       ✅        |         ❌         |      ❌      |
+|                         [MPNet](model_doc/mpnet)                         |       ✅        |         ✅         |      ❌      |
+|                           [MPT](model_doc/mpt)                           |       ✅        |         ❌         |      ❌      |
+|                           [MRA](model_doc/mra)                           |       ✅        |         ❌         |      ❌      |
+|                           [MT5](model_doc/mt5)                           |       ✅        |         ✅         |      ✅      |
+|                      [MusicGen](model_doc/musicgen)                      |       ✅        |         ❌         |      ❌      |
+|               [MusicGen Melody](model_doc/musicgen_melody)               |       ✅        |         ❌         |      ❌      |
+|                           [MVP](model_doc/mvp)                           |       ✅        |         ❌         |      ❌      |
+|                           [NAT](model_doc/nat)                           |       ✅        |         ❌         |      ❌      |
+|                      [Nemotron](model_doc/nemotron)                      |       ✅        |         ❌         |      ❌      |
+|                         [Nezha](model_doc/nezha)                         |       ✅        |         ❌         |      ❌      |
+|                          [NLLB](model_doc/nllb)                          |       ✅        |         ❌         |      ❌      |
+|                      [NLLB-MOE](model_doc/nllb-moe)                      |       ✅        |         ❌         |      ❌      |
+|                        [Nougat](model_doc/nougat)                        |       ✅        |         ✅         |      ✅      |
+|                 [Nyströmformer](model_doc/nystromformer)                 |       ✅        |         ❌         |      ❌      |
+|                          [OLMo](model_doc/olmo)                          |       ✅        |         ❌         |      ❌      |
+|                     [OneFormer](model_doc/oneformer)                     |       ✅        |         ❌         |      ❌      |
+|                    [OpenAI GPT](model_doc/openai-gpt)                    |       ✅        |         ✅         |      ❌      |
+|                      [OpenAI GPT-2](model_doc/gpt2)                      |       ✅        |         ✅         |      ✅      |
+|                    [OpenLlama](model_doc/open-llama)                     |       ✅        |         ❌         |      ❌      |
+|                           [OPT](model_doc/opt)                           |       ✅        |         ✅         |      ✅      |
+|                       [OWL-ViT](model_doc/owlvit)                        |       ✅        |         ❌         |      ❌      |
+|                         [OWLv2](model_doc/owlv2)                         |       ✅        |         ❌         |      ❌      |
+|                     [PaliGemma](model_doc/paligemma)                     |       ✅        |         ❌         |      ❌      |
+|                  [PatchTSMixer](model_doc/patchtsmixer)                  |       ✅        |         ❌         |      ❌      |
+|                      [PatchTST](model_doc/patchtst)                      |       ✅        |         ❌         |      ❌      |
+|                       [Pegasus](model_doc/pegasus)                       |       ✅        |         ✅         |      ✅      |
+|                     [PEGASUS-X](model_doc/pegasus_x)                     |       ✅        |         ❌         |      ❌      |
+|                     [Perceiver](model_doc/perceiver)                     |       ✅        |         ❌         |      ❌      |
+|                     [Persimmon](model_doc/persimmon)                     |       ✅        |         ❌         |      ❌      |
+|                           [Phi](model_doc/phi)                           |       ✅        |         ❌         |      ❌      |
+|                          [Phi3](model_doc/phi3)                          |       ✅        |         ❌         |      ❌      |
+|                       [PhoBERT](model_doc/phobert)                       |       ✅        |         ✅         |      ✅      |
+|                    [Pix2Struct](model_doc/pix2struct)                    |       ✅        |         ❌         |      ❌      |
+|                        [PLBart](model_doc/plbart)                        |       ✅        |         ❌         |      ❌      |
+|                    [PoolFormer](model_doc/poolformer)                    |       ✅        |         ❌         |      ❌      |
+|                     [Pop2Piano](model_doc/pop2piano)                     |       ✅        |         ❌         |      ❌      |
+|                    [ProphetNet](model_doc/prophetnet)                    |       ✅        |         ❌         |      ❌      |
+|                           [PVT](model_doc/pvt)                           |       ✅        |         ❌         |      ❌      |
+|                        [PVTv2](model_doc/pvt_v2)                         |       ✅        |         ❌         |      ❌      |
+|                       [QDQBert](model_doc/qdqbert)                       |       ✅        |         ❌         |      ❌      |
+|                         [Qwen2](model_doc/qwen2)                         |       ✅        |         ❌         |      ❌      |
+|                   [Qwen2Audio](model_doc/qwen2_audio)                    |       ✅        |         ❌         |      ❌      |
+|                     [Qwen2MoE](model_doc/qwen2_moe)                      |       ✅        |         ❌         |      ❌      |
+|                      [Qwen2VL](model_doc/qwen2_vl)                       |       ✅        |         ❌         |      ❌      |
+|                           [RAG](model_doc/rag)                           |       ✅        |         ✅         |      ❌      |
+|                         [REALM](model_doc/realm)                         |       ✅        |         ❌         |      ❌      |
+|               [RecurrentGemma](model_doc/recurrent_gemma)                |       ✅        |         ❌         |      ❌      |
+|                      [Reformer](model_doc/reformer)                      |       ✅        |         ❌         |      ❌      |
+|                        [RegNet](model_doc/regnet)                        |       ✅        |         ✅         |      ✅      |
+|                       [RemBERT](model_doc/rembert)                       |       ✅        |         ✅         |      ❌      |
+|                        [ResNet](model_doc/resnet)                        |       ✅        |         ✅         |      ✅      |
+|                     [RetriBERT](model_doc/retribert)                     |       ✅        |         ❌         |      ❌      |
+|                       [RoBERTa](model_doc/roberta)                       |       ✅        |         ✅         |      ✅      |
+|          [RoBERTa-PreLayerNorm](model_doc/roberta-prelayernorm)          |       ✅        |         ✅         |      ✅      |
+|                      [RoCBert](model_doc/roc_bert)                       |       ✅        |         ❌         |      ❌      |
+|                      [RoFormer](model_doc/roformer)                      |       ✅        |         ✅         |      ✅      |
+|                       [RT-DETR](model_doc/rt_detr)                       |       ✅        |         ❌         |      ❌      |
+|                [RT-DETR-ResNet](model_doc/rt_detr_resnet)                |       ✅        |         ❌         |      ❌      |
+|                          [RWKV](model_doc/rwkv)                          |       ✅        |         ❌         |      ❌      |
+|                           [SAM](model_doc/sam)                           |       ✅        |         ✅         |      ❌      |
+|                  [SeamlessM4T](model_doc/seamless_m4t)                   |       ✅        |         ❌         |      ❌      |
+|                [SeamlessM4Tv2](model_doc/seamless_m4t_v2)                |       ✅        |         ❌         |      ❌      |
+|                     [SegFormer](model_doc/segformer)                     |       ✅        |         ✅         |      ❌      |
+|                        [SegGPT](model_doc/seggpt)                        |       ✅        |         ❌         |      ❌      |
+|                           [SEW](model_doc/sew)                           |       ✅        |         ❌         |      ❌      |
+|                         [SEW-D](model_doc/sew-d)                         |       ✅        |         ❌         |      ❌      |
+|                        [SigLIP](model_doc/siglip)                        |       ✅        |         ❌         |      ❌      |
+|        [Speech Encoder decoder](model_doc/speech-encoder-decoder)        |       ✅        |         ❌         |      ✅      |
+|                 [Speech2Text](model_doc/speech_to_text)                  |       ✅        |         ✅         |      ❌      |
+|                      [SpeechT5](model_doc/speecht5)                      |       ✅        |         ❌         |      ❌      |
+|                      [Splinter](model_doc/splinter)                      |       ✅        |         ❌         |      ❌      |
+|                   [SqueezeBERT](model_doc/squeezebert)                   |       ✅        |         ❌         |      ❌      |
+|                      [StableLm](model_doc/stablelm)                      |       ✅        |         ❌         |      ❌      |
+|                    [Starcoder2](model_doc/starcoder2)                    |       ✅        |         ❌         |      ❌      |
+|                    [SuperPoint](model_doc/superpoint)                    |       ✅        |         ❌         |      ❌      |
+|                   [SwiftFormer](model_doc/swiftformer)                   |       ✅        |         ✅         |      ❌      |
+|                    [Swin Transformer](model_doc/swin)                    |       ✅        |         ✅         |      ❌      |
+|                 [Swin Transformer V2](model_doc/swinv2)                  |       ✅        |         ❌         |      ❌      |
+|                       [Swin2SR](model_doc/swin2sr)                       |       ✅        |         ❌         |      ❌      |
+|           [SwitchTransformers](model_doc/switch_transformers)            |       ✅        |         ❌         |      ❌      |
+|                            [T5](model_doc/t5)                            |       ✅        |         ✅         |      ✅      |
+|                        [T5v1.1](model_doc/t5v1.1)                        |       ✅        |         ✅         |      ✅      |
+|             [Table Transformer](model_doc/table-transformer)             |       ✅        |         ❌         |      ❌      |
+|                         [TAPAS](model_doc/tapas)                         |       ✅        |         ✅         |      ❌      |
+|                         [TAPEX](model_doc/tapex)                         |       ✅        |         ✅         |      ✅      |
+|       [Time Series Transformer](model_doc/time_series_transformer)       |       ✅        |         ❌         |      ❌      |
+|                   [TimeSformer](model_doc/timesformer)                   |       ✅        |         ❌         |      ❌      |
+|        [Trajectory Transformer](model_doc/trajectory_transformer)        |       ✅        |         ❌         |      ❌      |
+|                  [Transformer-XL](model_doc/transfo-xl)                  |       ✅        |         ✅         |      ❌      |
+|                         [TrOCR](model_doc/trocr)                         |       ✅        |         ❌         |      ❌      |
+|                          [TVLT](model_doc/tvlt)                          |       ✅        |         ❌         |      ❌      |
+|                           [TVP](model_doc/tvp)                           |       ✅        |         ❌         |      ❌      |
+|                          [UDOP](model_doc/udop)                          |       ✅        |         ❌         |      ❌      |
+|                           [UL2](model_doc/ul2)                           |       ✅        |         ✅         |      ✅      |
+|                          [UMT5](model_doc/umt5)                          |       ✅        |         ❌         |      ❌      |
+|                     [UniSpeech](model_doc/unispeech)                     |       ✅        |         ❌         |      ❌      |
+|                 [UniSpeechSat](model_doc/unispeech-sat)                  |       ✅        |         ❌         |      ❌      |
+|                       [UnivNet](model_doc/univnet)                       |       ✅        |         ❌         |      ❌      |
+|                       [UPerNet](model_doc/upernet)                       |       ✅        |         ❌         |      ❌      |
+|                           [VAN](model_doc/van)                           |       ✅        |         ❌         |      ❌      |
+|                   [VideoLlava](model_doc/video_llava)                    |       ✅        |         ❌         |      ❌      |
+|                      [VideoMAE](model_doc/videomae)                      |       ✅        |         ❌         |      ❌      |
+|                          [ViLT](model_doc/vilt)                          |       ✅        |         ❌         |      ❌      |
+|                      [VipLlava](model_doc/vipllava)                      |       ✅        |         ❌         |      ❌      |
+|        [Vision Encoder decoder](model_doc/vision-encoder-decoder)        |       ✅        |         ✅         |      ✅      |
+|       [VisionTextDualEncoder](model_doc/vision-text-dual-encoder)        |       ✅        |         ✅         |      ✅      |
+|                   [VisualBERT](model_doc/visual_bert)                    |       ✅        |         ❌         |      ❌      |
+|                           [ViT](model_doc/vit)                           |       ✅        |         ✅         |      ✅      |
+|                    [ViT Hybrid](model_doc/vit_hybrid)                    |       ✅        |         ❌         |      ❌      |
+|                        [VitDet](model_doc/vitdet)                        |       ✅        |         ❌         |      ❌      |
+|                       [ViTMAE](model_doc/vit_mae)                        |       ✅        |         ✅         |      ❌      |
+|                      [ViTMatte](model_doc/vitmatte)                      |       ✅        |         ❌         |      ❌      |
+|                       [ViTMSN](model_doc/vit_msn)                        |       ✅        |         ❌         |      ❌      |
+|                          [VITS](model_doc/vits)                          |       ✅        |         ❌         |      ❌      |
+|                         [ViViT](model_doc/vivit)                         |       ✅        |         ❌         |      ❌      |
+|                      [Wav2Vec2](model_doc/wav2vec2)                      |       ✅        |         ✅         |      ✅      |
+|                 [Wav2Vec2-BERT](model_doc/wav2vec2-bert)                 |       ✅        |         ❌         |      ❌      |
+|            [Wav2Vec2-Conformer](model_doc/wav2vec2-conformer)            |       ✅        |         ❌         |      ❌      |
+|              [Wav2Vec2Phoneme](model_doc/wav2vec2_phoneme)               |       ✅        |         ✅         |      ✅      |
+|                         [WavLM](model_doc/wavlm)                         |       ✅        |         ❌         |      ❌      |
+|                       [Whisper](model_doc/whisper)                       |       ✅        |         ✅         |      ✅      |
+|                        [X-CLIP](model_doc/xclip)                         |       ✅        |         ❌         |      ❌      |
+|                         [X-MOD](model_doc/xmod)                          |       ✅        |         ❌         |      ❌      |
+|                          [XGLM](model_doc/xglm)                          |       ✅        |         ✅         |      ✅      |
+|                           [XLM](model_doc/xlm)                           |       ✅        |         ✅         |      ❌      |
+|                [XLM-ProphetNet](model_doc/xlm-prophetnet)                |       ✅        |         ❌         |      ❌      |
+|                   [XLM-RoBERTa](model_doc/xlm-roberta)                   |       ✅        |         ✅         |      ✅      |
+|                [XLM-RoBERTa-XL](model_doc/xlm-roberta-xl)                |       ✅        |         ❌         |      ❌      |
+|                         [XLM-V](model_doc/xlm-v)                         |       ✅        |         ✅         |      ✅      |
+|                         [XLNet](model_doc/xlnet)                         |       ✅        |         ✅         |      ❌      |
+|                         [XLS-R](model_doc/xls_r)                         |       ✅        |         ✅         |      ✅      |
+|                 [XLSR-Wav2Vec2](model_doc/xlsr_wav2vec2)                 |       ✅        |         ✅         |      ✅      |
+|                         [YOLOS](model_doc/yolos)                         |       ✅        |         ❌         |      ❌      |
+|                          [YOSO](model_doc/yoso)                          |       ✅        |         ❌         |      ❌      |
+|                      [ZoeDepth](model_doc/zoedepth)                      |       ✅        |         ❌         |      ❌      |
+
+<!-- End table-->
diff --git a/docs/source/ar/installation.md b/docs/source/ar/installation.md
new file mode 100644
index 000000000000..ac5962ec8589
--- /dev/null
+++ b/docs/source/ar/installation.md
@@ -0,0 +1,246 @@
+# التثبيت (Installation)
+
+قم بتثبيت مكتبة 🤗 Transformers المناسبة لمكتبة التعلم العميق التي تستخدمها، وقم بإعداد ذاكرة التخزين المؤقت الخاصة بك، وقم بإعداد 🤗 Transformers للعمل دون اتصال بالإنترنت (اختياري).
+
+تم اختبار 🤗 Transformers على Python 3.6  والإصدارات الأحدث، وPyTorch 1.1.0 والإصدارات الأحدث، وTensorFlow 2.0 والإصدارات الأحدث، وFlax. اتبع تعليمات التثبيت أدناه لمكتبة التعلم العميق التي تستخدمها:
+
+* تعليمات تثبيت [PyTorch](https://pytorch.org/get-started/locally/).
+* تعليمات تثبيت [TensorFlow 2.0](https://www.tensorflow.org/install/pip).
+* تعليمات تثبيت [Flax](https://flax.readthedocs.io/en/latest/).
+
+## التثبيت باستخدام pip
+
+يجب عليك تثبيت 🤗 Transformers داخل [بيئة افتراضية](https://docs.python.org/3/library/venv.html). إذا لم تكن غير ملم ببيئات Python الافتراضية، فراجع هذا [الدليل](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). البيئة الافتراضية تسهل إدارة المشاريع المختلف، وتجنب مشكلات التوافق بين المكتبات المطلوبة (اعتماديات المشروع).
+
+ابدأ بإنشاء بيئة افتراضية في دليل مشروعك:
+
+```bash
+python -m venv .env
+```
+
+قم بتفعيل البيئة الافتراضية. على Linux وMacOs:
+
+```bash
+source .env/bin/activate
+```
+
+قم بتفعيل البيئة الافتراضية على Windows:
+
+```bash
+.env/Scripts/activate
+```
+
+الآن أنت مستعد لتثبيت 🤗 Transformers باستخدام الأمر التالي:
+
+```bash
+pip install transformers
+```
+
+للحصول على الدعم الخاص بـ CPU فقط، يمكنك تثبيت 🤗 Transformers ومكتبة التعلم العميق في خطوة واحدة. على سبيل المثال، قم بتثبيت 🤗 Transformers وPyTorch باستخدام:
+
+```bash
+pip install 'transformers[torch]'
+```
+
+🤗 Transformers وTensorFlow 2.0:
+
+```bash
+pip install 'transformers[tf-cpu]'
+```
+
+<Tip warning={true}>
+
+لمستخدمي M1 / ARM
+
+ستحتاج إلى تثبيت ما يلي قبل تثبيت TensorFLow 2.0
+```bash
+brew install cmake
+brew install pkg-config
+```
+
+</Tip>
+
+🤗 Transformers وFlax:
+
+```bash
+pip install 'transformers[flax]'
+```
+
+أخيرًا، تحقق مما إذا كان 🤗 Transformers قد تم تثبيته بشكل صحيح عن طريق تشغيل الأمر التالي. سيقوم بتنزيل نموذج مدرب مسبقًا:
+
+```bash
+python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))"
+```
+
+ثم قم بطباعة التسمية والنتيجة:
+
+```bash
+[{'label': 'POSITIVE', 'score': 0.9998704791069031}]
+```
+
+## التثبيت من المصدر
+
+قم بتثبيت 🤗 Transformers من المصدر باستخدام الأمر التالي:
+
+```bash
+pip install git+https://github.com/huggingface/transformers
+```
+
+يقوم هذا الأمر بتثبيت  أحدث إصدار تجريبي `main`  بدلاً من الإصدار المستقر `stable`. يعد إصدار `main` مفيدًا للمواكبة مع أحدث التطورات. على سبيل المثال، إذا تم إصلاح خطأ منذ الإصدار الرسمي الأخير ولكن لم يتم طرح إصدار جديد بعد. ومع ذلك، فإن هذا يعني أن إصدار التجريبي `main` قد لا يكون مستقرًا دائمًا. نسعى جاهدين للحفاظ على تشغيل إصدار `main`، ويتم حل معظم المشكلات عادةً في غضون بضع ساعات أو يوم. إذا واجهتك مشكلة، يرجى فتح [تقرير عن خلل](https://github.com/huggingface/transformers/issues) حتى نتمكن من إصلاحها في أقرب وقت ممكن!
+
+تحقق مما إذا كان 🤗 Transformers قد تم تثبيته بشكل صحيح عن طريق تشغيل الأمر التالي:
+
+```bash
+python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I love you'))"
+```
+
+تحقق مما إذا كان 🤗 Transformers قد تم تثبيته بشكل صحيح عن طريق تشغيل الأمر التالي:
+
+```bash
+python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I love you'))"
+```
+
+## التثبيت القابل للتعديل
+
+ستحتاج إلى تثبيت قابل للتعديل إذا كنت ترغب في:
+
+* استخدام إصدار `main` من كود المصدر.
+* المساهمة في 🤗 Transformers وتحتاج إلى اختبار التغييرات في الكود.
+
+قم باستنساخ المستودع وقم بتثبيت 🤗 Transformers باستخدام الأوامر التالية:
+
+```bash
+git clone https://github.com/huggingface/transformers.git
+cd transformers
+pip install -e .
+```
+
+ ستقوم هذه الأوامر بربط المجلد الذي قمت باستنساخ المستودع فيه بمسارات مكتبة Python. بمعنى آخر، سيبحث Python داخل المجلد الذي قمت باستنساخه بالإضافة إلى المسارات المعتادة للمكتبات. على سبيل المثال، إذا تم تثبيت حزم Python الخاصة بك عادةً في `~/anaconda3/envs/main/lib/python3.7/site-packages/`, فسيقوم Python أيضًا بالبحث في المجلد الذي قمت باستنساخه: `~/transformers/`.
+
+<Tip warning={true}>
+
+يجب عليك الاحتفاظ بمجلد `transformers` إذا كنت تريد الاستمرار في استخدام المكتبة.
+
+</Tip>
+
+الآن يمكنك تحديث المستنسخ الخاص بك بسهولة إلى أحدث إصدار من 🤗 Transformers باستخدام الأمر التالي:
+
+```bash
+cd ~/transformers/
+git pull
+```
+
+ستجد بيئة Python الإصدار `main` من 🤗 Transformers في المرة التالية التي تقوم فيها بتشغيله.
+
+## التثبيت باستخدام conda
+
+قم بالتثبيت من قناة conda `conda-forge`:
+
+```bash
+conda install conda-forge::transformers
+```
+
+## إعداد ذاكرة التخزين المؤقت
+
+تُحمّل النماذج المُسبقة التدريب وتُخزّن مؤقتًا في: `~/.cache/huggingface/hub`. هذا هو المجلد الافتراضي الذي يُحدده متغير البيئة `TRANSFORMERS_CACHE`. على Windows، يكون دليل ذاكرة التخزين المؤقت الافتراضي هو `C:\Users\username\.cache\huggingface\hub`. يمكنك تغيير متغيرات البيئة shell الموضحة أدناه - حسب الأولوية - لتحديد دليل ذاكرة تخزين مؤقت مختلف:
+
+1. متغير البيئة (افتراضي): `HUGGINGFACE_HUB_CACHE` أو `TRANSFORMERS_CACHE`.
+2. متغير البيئة: `HF_HOME`.
+3. متغير البيئة: `XDG_CACHE_HOME` + `/huggingface`.
+
+<Tip>
+
+سيستخدم 🤗 Transformers متغيرات البيئة `PYTORCH_TRANSFORMERS_CACHE` أو `PYTORCH_PRETRAINED_BERT_CACHE` إذا كنت قادمًا من إصدار سابق من هذه المكتبة وقمت بتعيين متغيرات البيئة هذه، ما لم تحدد متغير البيئة `TRANSFORMERS_CACHE`.
+
+</Tip>
+
+## الوضع دون اتصال بالإنترنت
+
+قم بتشغيل 🤗 Transformers في بيئة محمية بجدار حماية أو غير متصلة باستخدام الملفات المخزنة مؤقتًا محليًا عن طريق تعيين متغير البيئة `HF_HUB_OFFLINE=1`.
+
+<Tip>
+
+أضف [🤗 Datasets](https://huggingface.co/docs/datasets/) إلى سير عمل التدريب غير المتصل باستخدام متغير البيئة `HF_DATASETS_OFFLINE=1`.
+
+</Tip>
+
+```bash
+HF_DATASETS_OFFLINE=1 HF_HUB_OFFLINE=1 \
+python examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small --dataset_name wmt16 --dataset_config ro-en ...
+```
+
+يجب أن يعمل هذا البرنامج النصي دون توقف أو انتظار انتهاء المهلة الزمنية لأنه لن يحاول تنزيل النموذج من Hub.
+
+يمكنك أيضًا تجاوز تحميل نموذج من Hub من كل استدعاء [`~PreTrainedModel.from_pretrained`] باستخدام معلمة [`local_files_only`]. عندما يتم تعيينها على `True`، يتم تحميل الملفات المحلية فقط:
+
+```py
+from transformers import T5Model
+
+model = T5Model.from_pretrained("./path/to/local/directory", local_files_only=True)
+```
+
+### جلب النماذج والمُجزّئات لاستخدامها دون اتصال بالإنترنت
+
+خيار آخر لاستخدام 🤗 Transformers دون اتصال هو تنزيل الملفات مسبقًا، ثم الإشارة إلى مسارها المحلي عند الحاجة إلى استخدامها دون اتصال. هناك ثلاث طرق للقيام بذلك:
+
+* قم بتنزيل ملف عبر واجهة المستخدم على [Model Hub](https://huggingface.co/models) بالنقر فوق أيقونة ↓.
+
+    ![download-icon](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/download-icon.png)
+
+* استخدم سير عمل [`PreTrainedModel.from_pretrained`] و [`PreTrainedModel.save_pretrained`]:
+
+    1. قم بتنزيل ملفاتك مسبقًا باستخدام [`PreTrainedModel.from_pretrained`]:
+* استخدم سير عمل [`PreTrainedModel.from_pretrained`] و [`PreTrainedModel.save_pretrained`]:
+
+    1. قم بتنزيل ملفاتك مسبقًا باستخدام [`PreTrainedModel.from_pretrained`]:
+
+    ```py
+    >>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("bigscience/T0_3B")
+    >>> model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0_3B")
+    ```
+
+    2. احفظ ملفاتك إلى دليل محدد باستخدام [`PreTrainedModel.save_pretrained`]:
+
+    ```py
+    >>> tokenizer.save_pretrained("./your/path/bigscience_t0")
+    >>> model.save_pretrained("./your/path/bigscience_t0")
+    ```
+
+    3. الآن عندما تكون غير متصل بالإنترنت، أعد تحميل ملفاتك باستخدام [`PreTrainedModel.from_pretrained`] من الدليل المحدد:
+
+    ```py
+    >>> tokenizer = AutoTokenizer.from_pretrained("./your/path/bigscience_t0")
+    >>> model = AutoModel.from_pretrained("./your/path/bigscience_t0")
+    ```
+
+* قم بتنزيل الملفات برمجيًا باستخدام مكتبة [huggingface_hub](https://github.com/huggingface/huggingface_hub/tree/main/src/huggingface_hub):
+
+    1. قم بتثبيت مكتبة `huggingface_hub` في بيئتك الافتراضية:
+
+    ```bash
+    python -m pip install huggingface_hub
+    ```
+
+    2. استخدم وظيفة [`hf_hub_download`](https://huggingface.co/docs/hub/adding-a-library#download-files-from-the-hub) لتنزيل ملف إلى مسار محدد. على سبيل المثال، يقوم الأمر التالي بتنزيل ملف `config.json` من نموذج [T0](https://huggingface.co/bigscience/T0_3B) إلى المسار المطلوب:
+
+    ```py
+    >>> from huggingface_hub import hf_hub_download
+
+    >>> hf_hub_download(repo_id="bigscience/T0_3B", filename="config.json", cache_dir="./your/path/bigscience_t0")
+    ```
+
+بمجرد تنزيل ملفك وتخزينه مؤقتًا محليًا، حدد مساره المحلي الخاص به لتحميله واستخدامه:
+
+```py
+>>> from transformers import AutoConfig
+
+>>> config = AutoConfig.from_pretrained("./your/path/bigscience_t0/config.json")
+```
+
+<Tip>
+
+راجع قسم [كيفية تنزيل الملفات من Hub](https://huggingface.co/docs/hub/how-to-downstream) لمزيد من التفاصيل حول تنزيل الملفات المخزنة على Hub.
+
+</Tip>
diff --git a/docs/source/ar/llm_tutorial.md b/docs/source/ar/llm_tutorial.md
new file mode 100644
index 000000000000..264797a982b9
--- /dev/null
+++ b/docs/source/ar/llm_tutorial.md
@@ -0,0 +1,248 @@
+# التوليد باستخدام نماذج اللغات الكبيرة (LLMs)
+
+[[open-in-colab]]
+
+تعد LLMs، أو نماذج اللغة الكبيرة، المكون الرئيسي وراء توليد النصوص. وباختصار، تتكون من نماذج محول كبيرة مسبقة التدريب تم تدريبها للتنبؤ بالكلمة التالية (أو، بشكل أكثر دقة، الرمز اللغوي) بالنظر إلى نص معين. نظرًا لأنها تتنبأ برمز واحد في كل مرة، يجب عليك القيام بشيء أكثر تعقيدًا لتوليد جمل جديدة بخلاف مجرد استدعاء النموذج - يجب عليك إجراء التوليد التلقائي.
+
+التوليد التلقائي هو إجراء وقت الاستدلال الذي يتضمن استدعاء النموذج بشكل متكرر باستخدام مخرجاته الخاصة، بالنظر إلى بعض المدخلات الأولية. في 🤗 Transformers، يتم التعامل مع هذا بواسطة دالة [`~generation.GenerationMixin.generate`]، والتي تتوفر لجميع النماذج ذات القدرات التوليدية.
+
+سيوضح هذا البرنامج التعليمي كيفية:
+
+* تتوليد نص باستخدام نموذج اللغات الكبيرة (LLM)
+* تجنب الوقوع في الأخطاء الشائعة
+* الخطوات التالية لمساعدتك في الاستفادة القصوى من LLM الخاص بك
+
+قبل البدء، تأكد من تثبيت جميع المكتبات الضرورية:
+
+```bash
+pip install transformers bitsandbytes>=0.39.0 -q
+```
+
+## توليد النص
+
+يأخذ نموذج اللغة المدرب لـ [نمذجة اللغة السببية](tasks/language_modeling) يأخذ تسلسلًا من رموز نصية كمدخل ويعيد توزيع الاحتمالية للرمز التالي.
+
+<!-- [GIF 1 -- FWD PASS] -->
+<figure class="image table text-center m-0 w-full">
+    <video
+        style="max-width: 90%; margin: auto;"
+        autoplay loop muted playsinline
+        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/assisted-generation/gif_1_1080p.mov"
+    ></video>
+    <figcaption>"التنبؤ بالكلمة التالية لنموذج اللغة (LLM)"</figcaption>
+</figure>
+
+هناك جانب بالغ الأهمية في التوليد التلقائي باستخدام LLMs وهو كيفية اختيار الرمز التالي من توزيع الاحتمالية هذا. كل شيء مسموح به في هذه الخطوة طالما أنك تنتهي برمز للتكرار التالي. وهذا يعني أنه يمكن أن يكون بسيطًا مثل اختيار الرمز الأكثر احتمالًا من توزيع الاحتمالية أو معقدًا مثل تطبيق عشرات التحولات قبل أخذ العينات من التوزيع الناتج.
+
+<!-- [GIF 2 -- TEXT GENERATION] -->
+<figure class="image table text-center m-0 w-full">
+    <video
+        style="max-width: 90%; margin: auto;"
+        autoplay loop muted playsinline
+        src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/assisted-generation/gif_2_1080p.mov"
+    ></video>
+    <figcaption>"التوليد التلقائي المتسلسل"</figcaption>
+</figure>
+
+تتكرر العملية الموضحة أعلاه بشكل تكراري حتى يتم الوصول إلى شرط التوقف. في الوضع المثالي، يحدد النموذج شرط التوقف، والذي يجب أن يتعلم عند إخراج رمز نهاية التسلسل (`EOS`). إذا لم يكن الأمر كذلك، يتوقف التوليد عند الوصول إلى طول أقصى محدد مسبقًا.
+
+من الضروري إعداد خطوة اختيار الرمز وشرط التوقف بشكل صحيح لجعل نموذجك يتصرف كما تتوقع في مهمتك. ولهذا السبب لدينا [`~generation.GenerationConfig`] ملف مرتبط بكل نموذج، والذي يحتوي على معلمة توليدية افتراضية جيدة ويتم تحميله جنبًا إلى جنب مع نموذجك.
+
+دعنا نتحدث عن الكود!
+
+
+<Tip>
+
+إذا كنت مهتمًا بالاستخدام الأساسي لـ LLM، فإن واجهة [`Pipeline`](pipeline_tutorial) عالية المستوى هي نقطة انطلاق رائعة. ومع ذلك، غالبًا ما تتطلب LLMs ميزات متقدمة مثل التكميم والتحكم الدقيق في خطوة اختيار الرمز، والتي يتم تنفيذها بشكل أفضل من خلال [`~generation.GenerationMixin.generate`]. التوليد التلقائي باستخدام LLMs  يستهلك الكثير من المواردد ويجب تنفيذه على وحدة معالجة الرسومات للحصول على أداء كافٍ.
+
+</Tip>
+
+أولاً، تحتاج إلى تحميل النموذج.
+
+```py
+>>> from transformers import AutoModelForCausalLM
+
+>>> model = AutoModelForCausalLM.from_pretrained(
+...     "mistralai/Mistral-7B-v0.1", device_map="auto", load_in_4bit=True
+... )
+```
+
+ستلاحظ وجود معاملين في الاستدعاء `from_pretrained`:
+
+ - `device_map` يضمن انتقال النموذج إلى وحدة معالجة الرسومات (GPU) الخاصة بك
+ - `load_in_4bit` يطبق [4-bit dynamic quantization](main_classes/quantization) لخفض متطلبات الموارد بشكل كبير
+
+هناك طرق أخرى لتهيئة نموذج، ولكن هذا خط أساس جيد للبدء باستخدام LLM.
+
+بعد ذلك، تحتاج إلى معالجة إدخال النص الخاص بك باستخدام [مُجزّئ اللغوي](tokenizer_summary).
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", padding_side="left")
+>>> model_inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to("cuda")
+```
+
+يحتوي متغير `model_inputs` على النص المدخل بعد تقسيمه إلى وحدات لغوية (tokens)، بالإضافة إلى قناع الانتباه. في حين أن [`~generation.GenerationMixin.generate`] تبذل قصارى جهدها لاستنتاج قناع الانتباه عندما لا يتم تمريره، نوصي بتمريره كلما أمكن ذلك للحصول على نتائج مثالية.
+
+بعد تقسيم المدخلات إلى وحدات لغوية، يمكنك استدعاء الدالة [`~generation.GenerationMixin.generate`] لإرجاع الوحدات اللغوية الناتجة. يجب بعد ذلك تحويل الوحدات المولدة إلى نص قبل طباعته.
+
+```py
+>>> generated_ids = model.generate(**model_inputs)
+>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+'A list of colors: red, blue, green, yellow, orange, purple, pink,'
+```
+
+أخيرًا، ليس عليك معالجة المتتاليات الواحدة تلو الأخرى! يمكنك معالجة مجموعة من المدخلات دفعة واحدة، والتي ستعمل على تحسين الإنتاجية بشكل كبير بتكلفة صغيرة في زمن الاستجابة واستهلاك الذاكر. كل ما عليك التأكد منه هو  تعبئة المدخلات بشكل صحيح (المزيد حول ذلك أدناه).
+
+```py
+>>> tokenizer.pad_token = tokenizer.eos_token  # Most LLMs don't have a pad token by default
+>>> model_inputs = tokenizer(
+...     ["A list of colors: red, blue", "Portugal is"], return_tensors="pt", padding=True
+... ).to("cuda")
+>>> generated_ids = model.generate(**model_inputs)
+>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
+['A list of colors: red, blue, green, yellow, orange, purple, pink,',
+'Portugal is a country in southwestern Europe, on the Iber']
+```
+
+وهذا كل شيء! في بضع سطور من التعليمات البرمجية، يمكنك تسخير قوة LLM.
+
+## الأخطاء الشائعة
+
+هناك العديد من [استراتيجيات التوليد](generation_strategies)، وفي بعض الأحيان قد لا تكون القيم الافتراضية مناسبة لحالتك الاستخدام. إذا لم تكن الإخراج الخاصة بك متوافقة مع ما تتوقعه، فقد قمنا بإنشاء قائمة بأكثر الأخطاء الشائعة وكيفية تجنبها.
+
+```py
+>>> from transformers import AutoModelForCausalLM, AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
+>>> tokenizer.pad_token = tokenizer.eos_token  # Most LLMs don't have a pad token by default
+>>> model = AutoModelForCausalLM.from_pretrained(
+...     "mistralai/Mistral-7B-v0.1", device_map="auto", load_in_4bit=True
+... )
+```
+
+### الإخراج المولد قصير جدًا/طويل جدًا
+
+إذا لم يتم تحديد العدد الأقصى للرموز في [`~generation.GenerationConfig`] الملف، `generate` يعيد ما يصل إلى 20 رمزًا بشكل افتراضي. نوصي بشدة بتعيين `max_new_tokens` يدويًا في مكالمة `generate` للتحكم في العدد الأقصى من الرموز الجديدة التي يمكن أن يعيدها. ضع في اعتبارك أن LLMs (بشكل أكثر دقة، [نماذج فك التشفير فقط](https://huggingface.co/learn/nlp-course/chapter1/6؟fw=pt)) تعيد أيضًا المدخلات الأصلية كجزء من الناتج.
+```py
+>>> model_inputs = tokenizer(["A sequence of numbers: 1, 2"], return_tensors="pt").to("cuda")
+
+>>> # By default, the output will contain up to 20 tokens
+>>> generated_ids = model.generate(**model_inputs)
+>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+'A sequence of numbers: 1, 2, 3, 4, 5'
+
+>>> # Setting `max_new_tokens` allows you to control the maximum length
+>>> generated_ids = model.generate(**model_inputs, max_new_tokens=50)
+>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+'A sequence of numbers: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,'
+```
+
+### وضع التوليد الافتراضي
+
+بشكل افتراضي، وما لم يتم تحديده في [`~generation.GenerationConfig`] الملف، `generate` يحدد الكلمة الأكثر احتمالًا فى كل خطوة من خطوات عملية التوليد (وهذا يُعرف بالتشفير الجشع). اعتمادًا على مهمتك، قد يكون هذا غير مرغوب فيه؛ تستفيد المهام الإبداعية مثل برامج الدردشة أو كتابة مقال ستفيد من أسلوب العينة العشوائية في اختيار الكلمات، تمن ناحية أخرى، فإن المهام التي تعتمد على مدخلات محددة  مثل تحويل الصوت إلى نص أو الترجم من فك التشفير الجشع. قم بتفعيل أسلوب العينات العشوائية باستخدام `do_sample=True`، ويمكنك معرفة المزيد حول هذا الموضوع في [تدوينة المدونة](https://huggingface.co/blog/how-to-generate).
+
+```py
+>>> # Set seed or reproducibility -- you don't need this unless you want full reproducibility
+>>> from transformers import set_seed
+>>> set_seed(42)
+
+>>> model_inputs = tokenizer(["I am a cat."], return_tensors="pt").to("cuda")
+
+>>> # LLM + greedy decoding = repetitive, boring output
+>>> generated_ids = model.generate(**model_inputs)
+>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+'I am a cat. I am a cat. I am a cat. I am a cat'
+
+>>> # With sampling, the output becomes more creative!
+>>> generated_ids = model.generate(**model_inputs, do_sample=True)
+>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+'I am a cat.  Specifically, I am an indoor-only cat.  I'
+```
+
+### مشكلة حشو المدخلات فى الاتجاة الخطأ
+
+LLMs هي [معماريات فك التشفير فقط](https://huggingface.co/learn/nlp-course/chapter1/6؟fw=pt)، مما يعني أنها تستمر في التكرار على موجه الإدخال الخاص بك. فإن جميع المدخلات يجب أن تكون بنفس الطول. لحل هذه المسألة، يتم إضافة رموز حشو إلى المدخلات الأقصر. نظرًا لأن LLMs  لا تولي اهتمامًا لرموز الحشو هذه، ذلك، يجب تحديد الجزء المهم من المدخل الذي يجب أن يركز عليه النموذج، وهذا يتم عن طريق ما يسمى بـ "قناع الانتباه". يجب أن يكون الحشو في بداية المدخل (الحشو من اليسار)، وليس في نهايته.
+
+```py
+>>> # The tokenizer initialized above has right-padding active by default: the 1st sequence,
+>>> # which is shorter, has padding on the right side. Generation fails to capture the logic.
+>>> model_inputs = tokenizer(
+...     ["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt"
+... ).to("cuda")
+>>> generated_ids = model.generate(**model_inputs)
+>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+'1, 2, 33333333333'
+
+>>> # With left-padding, it works as expected!
+>>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", padding_side="left")
+>>> tokenizer.pad_token = tokenizer.eos_token  # Most LLMs don't have a pad token by default
+>>> model_inputs = tokenizer(
+...     ["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt"
+... ).to("cuda")
+>>> generated_ids = model.generate(**model_inputs)
+>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+'1, 2, 3, 4, 5, 6,'
+```
+
+### موجه غير صحيح
+
+تتوقع بعض نماذج اللغات الكبيرة على صيغة محددة للمدخلات  للعمل بشكل صحيح. إذا لم يتم اتباع هذه الصيغة، فإن أداء النموذج يتأثر سلبًا: لكن هذا التدهور قد لا يكون واضحًا للعيان. تتوفر معلومات إضافية حول التوجيه، بما في ذلك النماذج والمهام التي تحتاج إلى توخي الحذر، في [الدليل](tasks/prompting). دعنا نرى مثالاً باستخدام LLM للدردشة، والذي يستخدم [قالب الدردشة](chat_templating):
+```python
+>>> tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-alpha")
+>>> model = AutoModelForCausalLM.from_pretrained(
+...     "HuggingFaceH4/zephyr-7b-alpha", device_map="auto", load_in_4bit=True
+... )
+>>> set_seed(0)
+>>> prompt = """How many helicopters can a human eat in one sitting? Reply as a thug."""
+>>> model_inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
+>>> input_length = model_inputs.input_ids.shape[1]
+>>> generated_ids = model.generate(**model_inputs, max_new_tokens=20)
+>>> print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0])
+"I'm not a thug, but i can tell you that a human cannot eat"
+>>> # Oh no, it did not follow our instruction to reply as a thug! Let's see what happens when we write
+>>> # a better prompt and use the right template for this model (through `tokenizer.apply_chat_template`)
+
+>>> set_seed(0)
+>>> messages = [
+...     {
+...         "role": "system",
+...         "content": "You are a friendly chatbot who always responds in the style of a thug",
+...     },
+...     {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
+... ]
+>>> model_inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to("cuda")
+>>> input_length = model_inputs.shape[1]
+>>> generated_ids = model.generate(model_inputs, do_sample=True, max_new_tokens=20)
+>>> print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0])
+'None, you thug. How bout you try to focus on more useful questions?'
+>>> # As we can see, it followed a proper thug style 😎
+```
+
+## موارد إضافية
+
+في حين أن عملية التوليد التلقائي بسيطة نسبيًا، فإن الاستفادة القصوى من LLM الخاص بك يمكن أن تكون مهمة صعبة لأن هناك العديد من الأجزاء المتحركة. للخطوات التالية لمساعدتك في الغوص بشكل أعمق في استخدام LLM وفهمه:
+
+### استخدامات متقدمة للتوليد في نماذج اللغات الكبيرة
+
+1. دليل حول كيفية [التحكم في طرق التوليد المختلفة](generation_strategies)، وكيفية إعداد ملف تكوين التوليد، وكيفية بث الناتج؛
+2. [تسريع توليد النص](llm_optims)؛
+3.[قوالب موجهات للدردشة LLMs](chat_
+4. [دليل تصميم الموجه](tasks/prompting);
+5. مرجع واجهة برمجة التطبيقات (API)  [`~generation.GenerationConfig`], [`~generation.GenerationMixin.generate`], و  [generate-related classes](internal/generation_utils). والعديد من الفئات الأخرى المرتبطة بعملية التوليد.!
+
+### لوحات صدارة نماذج اللغات الكبيرة
+1. لوحة صدارة نماذج اللغات الكبيرة المفتوحة المصدر (Open LLM Leaderboard): تركز على جودة النماذج مفتوحة المصدر [رابط لوحة الصدارة](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard).
+2. لوحة صدارة أداء نماذج اللغات الكبيرة المفتوحة المصدر (Open LLM-Perf Leaderboard): تركز على إنتاجية نماذج اللغات الكبيرة [رابط لوحة الصدارة](https://huggingface.co/spaces/optimum/llm-perf-leaderboard).
+
+### زمن الاستجابة والإنتاجية واستهلاك الذاكرة
+1. دليل تحسين نماذج اللغات الكبيرة من حيث السرعة والذاكرة: دليل تحسين نماذج اللغات الكبيرة.
+2. التكميم (Quantization): دليل حول تقنية التكميم التكميم مثل تقنيتي bitsandbytes و autogptq، والتي توضح كيفية تقليل متطلبات الذاكرة بشكل كبير.
+
+### مكتبات مرتبطة
+1. [`optimum`](https://github.com/huggingface/optimum), امتداد لمكتبة Transformers يعمل على تحسين الأداء لأجهزة معينة.
+2. [`outlines`](https://github.com/outlines-dev/outlines), مكتبة للتحكم في توليد النصوص (على سبيل المثال، لتوليد ملفات JSON).
+3. [`SynCode`](https://github.com/uiuc-focal-lab/syncode), مكتبة للتوليد الموجه بقواعد اللغة الخالية من السياق (على سبيل المثال، JSON، SQL، Python).
+4. [`text-generation-inference`](https://github.com/huggingface/text-generation-inference), خادم جاهز للإنتاج لنماذج اللغات الكبيرة.
+5. [`text-generation-webui`](https://github.com/oobabooga/text-generation-webui), واجهة مستخدم لتوليد النصوص.   
diff --git a/docs/source/ar/model_sharing.md b/docs/source/ar/model_sharing.md
new file mode 100644
index 000000000000..620261a0c58a
--- /dev/null
+++ b/docs/source/ar/model_sharing.md
@@ -0,0 +1,223 @@
+# شارك نموذجك مع العالم
+
+أظهرت آخر درسين تعليميين كيفية ضبط نموذج بدقة باستخدام PyTorch و Keras و 🤗 Accelerate لعمليات التهيئة الموزعة. والخطوة التالية هي مشاركة نموذجك مع المجتمع! في Hugging Face، نؤمن بالمشاركة المفتوحة للمعرفة والموارد لتمكين الجميع من الاستفادة من الذكاء الاصطناعي. ونشجعك على مشاركة نموذجك مع المجتمع لمساعدة الآخرين على توفير الوقت والموارد.
+
+في هذا الدرس، ستتعلم طريقتين لمشاركة نموذجك المدرب أو مضبوط على منصة [Model Hub](https://huggingface.co/models):
+
+- رفع ملفاتك إلى منصة Hub مباشرة باستخدام الكود البرمجي.
+
+- قم بسحب وإفلات ملفاتك إلى Hub باستخدام الواجهة web.
+
+<iframe width="560" height="315" src="https://www.youtube.com/embed/XvSGPZFEjDY" title="مشغل فيديو YouTube"
+frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope;
+picture-in-picture" allowfullscreen></iframe>
+
+<Tip>
+
+لمشاركة نموذج مع المجتمع، تحتاج إلى حساب على [huggingface.co](https://huggingface.co/join). يمكنك أيضًا الانضمام إلى منظمة موجودة أو إنشاء منظمة جديدة.
+
+</Tip>
+
+## ميزات المستودع
+
+يعمل كل مستودع على Model Hub مثل مستودع GitHub النتقليدي. تقدم مستودعاتنا التحكم في الإصدارات وسجل التغييرات، وقدرة على رؤية الاختلافات بين الإصدارات.
+
+تعتمد آلية التحكم في الإصدارات على منصة Model Hub على نظامي git و [git-lfs](https://git-lfs.github.com/). وبعبارة أخرى، يمكنك التعامل مع  كل نموذج كأنه مستودع مستقل، مما يمكّن من زيادة التحكم في الوصول والقابلية للتطوير. يسمح التحكم في الإصدار بإجراء تعديلات وتثبيت إصدار محدد من النموذج باستخدام رمز التغيير (commit hash) أو وسم (tag) أو فرع (branch).
+
+بفضل هذه الميزة، يمكنك تحميل إصدار محدد من النموذج باستخدام معلمة الإصدار "revision":
+
+```py
+>>> model = AutoModel.from_pretrained(
+...     "julien-c/EsperBERTo-small", revision="v2.0.1"  # اسم العلامة، أو اسم الفرع، أو تجزئة الالتزام
+... )
+```
+
+من السهل أيضًا تعديل الملفات  الموجودة داخل مستودع، ويمكنك عرض سجل التغييرات التي طرأت على هذه الملفات ومعاينة الاختلافات بين الإصدارات المختلفة:
+
+![vis_diff](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vis_diff.png)
+
+## الإعداد
+
+قبل مشاركة نموذج على Hub، ستحتاج إلى بيانات اعتماد حساب Hugging Face الخاصة بك.  إذا كنت تستخدم منصة الأوامر، فقم بتشغيل الأمر التالي في بيئة افتراضية حيث تم تثبيت 🤗 Transformers. سيقوم هذا الأمر بتخزين رمز الدخول الخاص بك في مجلد تخزين المؤقت لـ Hugging Face (`~/.cache/` بشكل افتراضي):
+
+```bash
+huggingface-cli login
+```
+
+إذا كنت تستخدم دفتر ملاحظات مثل Jupyter أو Colaboratory، فتأكد من تثبيت مكتبة [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library). تسمح لك هذه المكتبة بالتفاعل برمجيًا مع Hub.
+
+```bash
+pip install huggingface_hub
+```
+
+ثم استخدم `notebook_login` لتسجيل الدخول إلى Hub، واتبع الرابط [هنا](https://huggingface.co/settings/token) لإنشاء رمز للتسجيل:
+
+```py
+>>> from huggingface_hub import notebook_login
+
+>>> notebook_login()
+```
+
+
+## تحويل النموذج ليتوافق مع جميع الأطر العمل
+
+لضمان إمكانية استخدام نموذجك من قبل شخص يعمل بإطار عمل مختلف، نوصي بتحويل نموذجك ورفعه مع نقاط التحقق من PyTorch و TensorFlow. في حين أن المستخدمين لا يزال بإمكانهم تحميل نموذجك من إطار عمل مختلف إذا تخطيت هذه الخطوة، إلا أنه سيكون أبطأ لأن 🤗 Transformers ستحتاج إلى تحويل نقطة التحقق أثناء التشغيل.
+
+تحويل نقطة التحقق لإطار عمل آخر أمر سهل. تأكد من تثبيت PyTorch و TensorFlow (راجع [هنا](installation) لتعليمات التثبيت)، ثم ابحث عن النموذج الملائم لمهمتك في الإطار الآخر.
+
+<frameworkcontent>
+<pt>
+حدد `from_tf=True` لتحويل نقطة تحقق من TensorFlow إلى PyTorch:
+
+```py
+>>> pt_model = DistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_tf=True)
+>>> pt_model.save_pretrained("path/to/awesome-name-you-picked")
+```
+</pt>
+<tf>
+حدد `from_pt=True` لتحويل نقطة تحقق من PyTorch إلى TensorFlow:
+
+```py
+>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_pt=True)
+```
+
+بعد ذلك، يمكنك حفظ نموذج TensorFlow الجديد بنقطة التحقق الجديدة:
+
+```py
+>>> tf_model.save_pretrained("path/to/awesome-name-you-picked")
+```
+</tf>
+<jax>
+إذا كان النموذج متاحًا في Flax، فيمكنك أيضًا تحويل نقطة تحقق من PyTorch إلى Flax:
+
+```py
+>>> flax_model = FlaxDistilBertForSequenceClassification.from_pretrained(
+...     "path/to/awesome-name-you-picked", from_pt=True
+... )
+```
+</jax>
+</frameworkcontent>
+
+## دفع نموذج أثناء التدريب
+
+<frameworkcontent>
+<pt>
+<Youtube id="Z1-XMy-GNLQ"/>
+
+مشاركة نموذجك على Hub مر بسيط للغاية كل ما عليك هو إضافة معلمة أو استدعاء رد إضافي. كما تذكر من درس [التدريب الدقيق](training)، فإن فئة [`TrainingArguments`] هي المكان الذي تحدد فيه المعلمات الفائقة وخيارات التدريب الإضافية. تشمل إحدى خيارات التدريب هذه القدرة على دفع النموذج مباشرة إلى المنصة Hub. قم بتعيين `push_to_hub=True` في [`TrainingArguments`]:
+
+```py
+>>> training_args = TrainingArguments(output_dir="my-awesome-model", push_to_hub=True)
+```
+
+مرر معامﻻت التدريب كالمعتاد إلى [`Trainer`]:
+
+```py
+>>> trainer = Trainer(
+...     model=model,
+...     args=training_args,
+...     train_dataset=small_train_dataset,
+...     eval_dataset=small_eval_dataset,
+...     compute_metrics=compute_metrics,
+... )
+```
+
+بعد ضبط نموذجك بدقة، يمكنك استخدام دالة [`~transformers.Trainer.push_to_hub`] المتاحة في [`Trainer`] لدفع النموذج المدرب إلى المنصة Hub. سوف تضيف 🤗 Transformers تلقائيًا المعلمات الفائقة المستخدمة في التدريب ونتائج التدريب وإصدارات الإطار إلى بطاقة معلومات النموذج الخاصة بك!
+
+```py
+>>> trainer.push_to_hub()
+```
+</pt>
+<tf>
+شارك نموذجًا على Hub باستخدام [`PushToHubCallback`]. في دالة [`PushToHubCallback`], أضف:
+
+- دليل إخراج لنموذجك.
+- مُجزّئ اللغوي.
+- `hub_model_id`، والذي هو اسم مستخدم Hub واسم النموذج الخاص بك.
+
+```py
+>>> from transformers import PushToHubCallback
+
+>>> push_to_hub_callback = PushToHubCallback(
+...     output_dir="./your_model_save_path", tokenizer=tokenizer, hub_model_id="your-username/my-awesome-model"
+... )
+```
+
+أضف الاستدعاء إلى [`fit`](https://keras.io/api/models/model_training_apis/)، وسيقوم 🤗 Transformers بدفع النموذج المدرب إلى Hub:
+
+```py
+>>> model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3, callbacks=push_to_hub_callback)
+```
+</tf>
+</frameworkcontent>
+
+## استخدام دالة `push_to_hub`
+
+يمكنك أيضًا استدعاء `push_to_hub` مباشرة على نموذجك لتحميله إلى Hub.
+
+حدد اسم نموذجك في `push_to_hub`:
+
+```py
+>>> pt_model.push_to_hub("my-awesome-model")
+```
+
+ينشئ هذا مستودعًا تحت اسم المستخدم الخاص بك باسم نموذج `my-awesome-model`. يمكن للمستخدمين الآن تحميل نموذجك باستخدام دالة `from_pretrained`:
+
+```py
+>>> from transformers import AutoModel
+
+>>> model = AutoModel.from_pretrained("your_username/my-awesome-model")
+```
+```py
+>>> from transformers import AutoModel
+
+>>> model = AutoModel.from_pretrained("your_username/my-awesome-model")
+```
+
+إذا كنت تنتمي إلى منظمة وتريد دفع نموذجك تحت اسم المنظمة بدلاً من ذلك، فما عليك سوى إضافته إلى `repo_id`:
+
+```py
+>>> pt_model.push_to_hub("my-awesome-org/my-awesome-model")
+```
+
+يمكن أيضًا استخدام دالة `push_to_hub` لإضافة ملفات أخرى إلى مستودع النماذج. على سبيل المثال، أضف رموزًا إلى مستودع نموذج:
+
+```py
+>>> tokenizer.push_to_hub("my-awesome-model")
+```
+
+أو ربما تريد إضافة إصدار TensorFlow من نموذج PyTorch المضبوط:
+
+```py
+>>> tf_model.push_to_hub("my-awesome-model")
+```
+
+الآن عند الانتقال إلى ملفك الشخصي على Hugging Face، يجب أن ترى مستودع النماذج الذي أنشأته حديثًا. سيؤدي النقر فوق علامة التبويب **Files** إلى عرض جميع الملفات التي قمت بتحميلها في المستودع.
+
+للحصول على مزيد من التفاصيل حول كيفية إنشاء الملفات وتحميلها إلى مستودع، راجع وثائق Hub [هنا](https://huggingface.co/docs/hub/how-to-upstream).
+
+## التحميل باستخدام الواجهة web
+
+يمكن للمستخدمين الذين يفضلون نهج عدم الترميز تحميل نموذج من خلال واجهة Hub web. قم بزيارة [huggingface.co/new](https://huggingface.co/new) لإنشاء مستودع جديد:
+
+![new_model_repo](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/new_model_repo.png)
+
+من هنا، أضف بعض المعلومات حول نموذجك:
+
+- حدد **مالك** المستودع. يمكن أن يكون هذا أنت أو أي من المنظمات التي تنتمي إليها.
+- اختر اسمًا لنموذجك، والذي سيكون أيضًا اسم المستودع.
+- اختر ما إذا كان نموذجك عامًا أم خاصًا.
+- حدد ترخيص الاستخدام لنموذجك.
+
+الآن انقر فوق علامة التبويب **Files** ثم انقر فوق الزر **Add file** لإضافة ملف جديد إلى مستودعك. ثم اسحب وأسقط ملفًا لتحميله وأضف رسالة الالتزام.
+
+![upload_file](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/upload_file.png)
+
+## إضافة بطاقة نموذج
+
+للتأكد من فهم المستخدمين لقدرات نموذجك وقيوده وتحيزاته المحتملة واعتباراته الأخلاقية، يرجى إضافة بطاقة نموذج إلى مستودعك. يتم تعريف بطاقة النموذج في ملف `README.md`. يمكنك إضافة بطاقة نموذج عن طريق:
+
+* قم بإنشاء ملف `README.md` وتحميله يدويًا.
+* انقر فوق الزر **Edit model card** في مستودع نموذجك.
+
+الق نظرة على بطاقة [DistilBert](https://huggingface.co/distilbert/distilbert-base-uncased) للحصول على مثال جيد على نوع المعلومات التي يجب أن تتضمنها بطاقة النموذج. للحصول على مزيد من التفاصيل حول الخيارات الأخرى التي يمكنك التحكم فيها في ملف `README.md` مثل البصمة الكربونية للنموذج أو أمثلة الأداة، راجع الوثائق [هنا](https://huggingface.co/docs/hub/models-cards).
\ No newline at end of file
diff --git a/docs/source/ar/peft.md b/docs/source/ar/peft.md
new file mode 100644
index 000000000000..f5f050ade427
--- /dev/null
+++ b/docs/source/ar/peft.md
@@ -0,0 +1,250 @@
+# تحميل المحوّلات باستخدام 🤗 PEFT
+
+[[open-in-colab]]
+
+تقنية "التدريب الدقيق ذو الكفاءة البارامتيرية" (PEFT)](https://huggingface.co/blog/peft) تقوم بتجميد معلمات النموذج المُدرب مسبقًا أثناء الضبط الدقيق وتضيف عدد صغير من المعلمات القابلة للتدريب (المحولات) فوقه. يتم تدريب المحوّلات لتعلم معلومات خاصة بالمهام. وقد ثبت أن هذا النهج فعال للغاية من حيث استخدام الذاكرة مع انخفاض استخدام الكمبيوتر أثناء إنتاج نتائج قمماثلة للنموذج مضبوط دقيقًا بالكامل.
+
+عادة ما تكون المحولات المدربة باستخدام PEFT أصغر بمقدار كبير من حيث الحجم من النموذج الكامل، مما يجعل من السهل مشاركتها وتخزينها وتحميلها.
+
+<div class="flex flex-col justify-center">
+  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/peft/PEFT-hub-screenshot.png"/>
+  <figcaption class="text-center">تبلغ أوزان المحول لطراز OPTForCausalLM المخزن على Hub حوالي 6 ميجابايت مقارنة بالحجم الكامل لأوزان النموذج، والتي يمكن أن تكون حوالي 700 ميجابايت.</figcaption>
+</div>
+
+إذا كنت مهتمًا بمعرفة المزيد عن مكتبة 🤗 PEFT، فراجع [الوثائق](https://huggingface.co/docs/peft/index).
+
+## الإعداد
+
+ابدأ بتثبيت 🤗 PEFT:
+
+```bash
+pip install peft
+```
+
+إذا كنت تريد تجربة الميزات الجديدة تمامًا، فقد تكون مهتمًا بتثبيت المكتبة من المصدر:
+
+```bash
+pip install git+https://github.com/huggingface/peft.git
+```
+
+## نماذج PEFT المدعومة
+
+يدعم 🤗 Transformers بشكلٍ أصلي بعض طرق PEFT، مما يعني أنه يمكنك تحميل أوزان المحول المخزنة محليًا أو على Hub وتشغيلها أو تدريبها ببضع سطور من التعليمات البرمجية. الطرق المدعومة هي:
+
+- [محولات الرتبة المنخفضة](https://huggingface.co/docs/peft/conceptual_guides/lora)
+- [IA3](https://huggingface.co/docs/peft/conceptual_guides/ia3)
+- [AdaLoRA](https://arxiv.org/abs/2303.10512)
+
+إذا كنت تريد استخدام طرق PEFT الأخرى، مثل تعلم المحث أو ضبط المحث، أو حول مكتبة 🤗 PEFT بشكل عام، يرجى الرجوع إلى [الوثائق](https://huggingface.co/docs/peft/index).
+
+## تحميل محول PEFT
+
+لتحميل نموذج محول PEFT واستخدامه من 🤗 Transformers، تأكد من أن مستودع Hub أو الدليل المحلي يحتوي على ملف `adapter_config.json` وأوزان المحوّل، كما هو موضح في صورة المثال أعلاه. بعد ذلك، يمكنك تحميل نموذج محوّل PEFT باستخدام فئة `AutoModelFor`. على سبيل المثال، لتحميل نموذج محول PEFT للنمذجة اللغوية السببية:
+
+1. حدد معرف النموذج  لPEFT
+2. مرره إلى فئة [`AutoModelForCausalLM`]
+
+```py
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+peft_model_id = "ybelkada/opt-350m-lora"
+model = AutoModelForCausalLM.from_pretrained(peft_model_id)
+```
+
+<Tip>
+
+يمكنك تحميل محول PEFT باستخدام فئة `AutoModelFor` أو فئة النموذج الأساسي مثل `OPTForCausalLM` أو `LlamaForCausalLM`.
+
+</Tip>
+
+يمكنك أيضًا تحميل محول PEFT عن طريق استدعاء طريقة `load_adapter`:
+
+```py
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_id = "facebook/opt-350m"
+peft_model_id = "ybelkada/opt-350m-lora"
+
+model = AutoModelForCausalLM.from_pretrained(model_id)
+model.load_adapter(peft_model_id)
+```
+
+راجع قسم [وثائق API](#transformers.integrations.PeftAdapterMixin) أدناه لمزيد من التفاصيل.
+
+## التحميل في 8 بت أو 4 بت
+
+راجع قسم [وثائق API](#transformers.integrations.PeftAdapterMixin) أدناه لمزيد من التفاصيل.
+
+## التحميل في 8 بت أو 4 بت
+
+يدعم تكامل `bitsandbytes` أنواع بيانات الدقة 8 بت و4 بت، والتي تكون مفيدة لتحميل النماذج الكبيرة لأنها توفر  مساحة في الذاكرة (راجع دليل تكامل `bitsandbytes` [guide](./quantization#bitsandbytes-integration) لمعرفة المزيد). أضف المعلمات`load_in_8bit` أو `load_in_4bit` إلى [`~PreTrainedModel.from_pretrained`] وقم بتعيين `device_map="auto"` لتوزيع النموذج بشكل فعال على الأجهزة لديك:
+
+```py
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+peft_model_id = "ybelkada/opt-350m-lora"
+model = AutoModelForCausalLM.from_pretrained(peft_model_id, quantization_config=BitsAndBytesConfig(load_in_8bit=True))
+```
+
+## إضافة محول جديد
+
+يمكنك استخدام الدالة [`~peft.PeftModel.add_adapter`] لإضافة محوّل جديد إلى نموذج يحتوي بالفعل على محوّل آخر طالما أن المحول الجديد  مطابقًا للنوع الحالي. على سبيل المثال، إذا كان لديك محول LoRA موجود مرتبط بنموذج:
+
+```py
+from transformers import AutoModelForCausalLM, OPTForCausalLM, AutoTokenizer
+from peft import LoraConfig
+
+model_id = "facebook/opt-350m"
+model = AutoModelForCausalLM.from_pretrained(model_id)
+
+lora_config = LoraConfig(
+    target_modules=["q_proj", "k_proj"],
+    init_lora_weights=False
+)
+
+model.add_adapter(lora_config, adapter_name="adapter_1")
+```
+
+لإضافة محول جديد:
+
+```py
+# قم بتعليق محول جديد بنفس التكوين
+model.add_adapter(lora_config, adapter_name="adapter_2")
+```
+
+الآن يمكنك استخدام [`~peft.PeftModel.set_adapter`] لتعيين المحول الذي سيتم استخدامه:
+
+```py
+# استخدم adapter_1
+model.set_adapter("adapter_1")
+output = model.generate(**inputs)
+print(tokenizer.decode(output_disabled[0], skip_special_tokens=True))
+
+# استخدم adapter_2
+model.set_adapter("adapter_2")
+output_enabled = model.generate(**inputs)
+print(tokenizer.decode(output_enabled[0], skip_special_tokens=True))
+```
+
+## تمكين وتعطيل المحولات
+
+بمجرد إضافة محول إلى نموذج، يمكنك تمكين أو تعطيل وحدة المحول. لتمكين وحدة المحول:
+
+```py
+from transformers import AutoModelForCausalLM, OPTForCausalLM, AutoTokenizer
+from peft import PeftConfig
+
+model_id = "facebook/opt-350m"
+adapter_model_id = "ybelkada/opt-350m-lora"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+text = "Hello"
+inputs = tokenizer(text, return_tensors="pt")
+
+model = AutoModelForCausalLM.from_pretrained(model_id)
+peft_config = PeftConfig.from_pretrained(adapter_model_id)
+
+# لبدء تشغيله بأوزان عشوائية
+peft_config.init_lora_weights = False
+
+model.add_adapter(peft_config)
+model.enable_adapters()
+output = model.generate(**inputs)
+```
+
+لإيقاف تشغيل وحدة المحول:
+
+```py
+model.disable_adapters()
+output = model.generate(**inputs)
+```
+
+## تدريب محول PEFT
+
+يدعم محول PEFT فئة [`Trainer`] بحيث يمكنك تدريب محول لحالتك الاستخدام المحددة. فهو يتطلب فقط إضافة بضع سطور أخرى من التعليمات البرمجية. على سبيل المثال، لتدريب محول LoRA:
+
+<Tip>
+
+إذا لم تكن معتادًا على ضبط نموذج دقيق باستخدام [`Trainer`، فراجع البرنامج التعليمي](training) لضبط نموذج مُدرب مسبقًا.
+
+</Tip>
+
+1. حدد تكوين المحول باستخدام نوع المهمة والمعاملات الزائدة (راجع [`~peft.LoraConfig`] لمزيد من التفاصيل حول وظيفة هذه  المعلمات).
+
+```py
+from peft import LoraConfig
+
+peft_config = LoraConfig(
+    lora_alpha=16,
+    lora_dropout=0.1,
+    r=64,
+    bias="none",
+    task_type="CAUSAL_LM"،
+)
+```
+
+2. أضف المحول إلى النموذج.
+
+```py
+model.add_adapter(peft_config)
+```
+
+3. الآن يمكنك تمرير النموذج إلى [`Trainer`]!
+
+```py
+trainer = Trainer(model=model, ...)
+trainer.train()
+```
+
+لحفظ محول المدرب وتحميله مرة أخرى:
+
+```py
+model.save_pretrained(save_dir)
+model = AutoModelForCausalLM.from_pretrained(save_dir)
+```
+
+## إضافة طبقات قابلة للتدريب إضافية إلى محول PEFT
+
+```py
+model.save_pretrained(save_dir)
+model = AutoModelForCausalLM.from_pretrained(save_dir)
+```
+
+## إضافة طبقات قابلة للتدريب إضافية إلى محول PEFT
+
+يمكنك أيضًا إجراء تدريب دقيق لمحوّلات قابلة للتدريب إضافية فوق نموذج يحتوي بالفعل على محوّلات عن طريق تمرير معلم `modules_to_save` في تكوين PEFT الخاص بك. على سبيل المثال، إذا كنت تريد أيضًا ضبط دقيق لرأس النموذج اللغوي`lm_head` فوق نموذج بمحوّل LoRA:
+
+```py
+from transformers import AutoModelForCausalLM, OPTForCausalLM, AutoTokenizer
+from peft import LoraConfig
+
+model_id = "facebook/opt-350m"
+model = AutoModelForCausalLM.from_pretrained(model_id)
+
+lora_config = LoraConfig(
+    target_modules=["q_proj", "k_proj"],
+    modules_to_save=["lm_head"]،
+)
+
+model.add_adapter(lora_config)
+```
+
+## وثائق API
+
+[[autodoc]] integrations.PeftAdapterMixin
+    - load_adapter
+    - add_adapter
+    - set_adapter
+    - disable_adapters
+    - enable_adapters
+    - active_adapters
+    - get_adapter_state_dict
+
+
+
+
+<!--
+TODO: (@younesbelkada @stevhliu)
+-   Link to PEFT docs for further details
+-   Trainer
+-   8-bit / 4-bit examples ?
+-->
\ No newline at end of file
diff --git a/docs/source/ar/pipeline_tutorial.md b/docs/source/ar/pipeline_tutorial.md
new file mode 100644
index 000000000000..2dd713a6533f
--- /dev/null
+++ b/docs/source/ar/pipeline_tutorial.md
@@ -0,0 +1,315 @@
+# خطوط الأنابيب الاستدلال
+
+يجعل [`pipeline`] من السهل استخدام أي نموذج من [Hub](https://huggingface.co/models) للاستدلال لأي مهام خاصة باللغة أو الرؤية الحاسوبية أو الكلام أو المهام متعددة الوسائط. حتى إذا لم يكن لديك خبرة في طريقة معينة أو لم تكن على دراية بالرمز الأساسي وراء النماذج، يمكنك مع ذلك استخدامها للاستدلال باستخدام [`pipeline`]! سوف يُعلمك هذا البرنامج التعليمي ما يلي:
+
+* استخدام [`pipeline`] للاستدلال.
+* استخدم مُجزّئ أو نموذجًا محددًا.
+* استخدم [`pipeline`] للمهام الصوتية والبصرية والمتعددة الوسائط.
+
+<Tip>
+
+اطلع على وثائق [`pipeline`] للحصول على القائمة كاملة بالمهام المدعومة والمعلمات المتاحة.
+
+</Tip>
+
+## استخدام الأنابيب
+
+على الرغم من أن لكل مهمة أنبوب [`pipeline`] خاص بها، إلا أنه من الأبسط استخدام تجريد خط الأنابيب العام [`pipeline`] الذي يحتوي على جميع خطوط الأنابيب الخاصة بالمهمة. يقوم [`pipeline`] تلقائيًا بتحميل نموذج افتراضي وفئة معالجة مسبقة قادرة على الاستدلال لمهمتك. دعنا نأخذ مثال استخدام [`pipeline`] للتعرف التلقائي على الكلام (ASR)، أو تحويل الكلام إلى نص.
+
+1. ابدأ بإنشاء [`pipeline`] وحدد مهمة الاستدلال:
+
+```py
+>>> from transformers import pipeline
+
+>>> transcriber = pipeline(task="automatic-speech-recognition")
+```
+
+2. مرر إدخالك إلى [`pipeline`]. في حالة التعرف على الكلام، يكون هذا ملف إدخال صوتي:
+
+```py
+>>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
+{'text': 'I HAVE A DREAM BUT ONE DAY THIS NATION WILL RISE UP LIVE UP THE TRUE MEANING OF ITS TREES'}
+```
+
+لم تحصل على النتيجة التي تريدها؟ تحقق من بعض [نماذج التعرف على الكلام الأكثر تنزيلًا](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=trending) 
+على Hub لمعرفة ما إذا كان بإمكانك الحصول على نسخة منقحة أفضل.
+
+لنَجرب نموذج [Whisper large-v2](https://huggingface.co/openai/whisper-large) من OpenAI. تم إصدار Whisper بعد عامين من إصدار Wav2Vec2، وتم تدريبه على ما يقرب من 10 أضعاف كمية البيانات. وبهذه الصفة، فإنه يتفوق على Wav2Vec2 في معظم معظم المقاييس. كما أنه يمتلك ميزة إضافية وهي في التنبؤ بعلامات الترقيم وحالة الأحرف، والتي لا يمكن تحقيقها مع Wav2Vec2.
+
+دعونا نجربها هنا لنرى كيف تؤدي:
+
+```py
+>>> transcriber = pipeline(model="openai/whisper-large-v2")
+>>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
+{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'}
+```
+
+الآن تبدو هذه النتيجة أكثر دقة! لمقارنة عميقة حول Wav2Vec2 مقابل Whisper، راجع [دورة Audio Transformers](https://huggingface.co/learn/audio-course/chapter5/asr_models).
+نشجعك بشدة على التحقق من Hub للحصول على نماذج بلغات مختلفة، ونماذج متخصصة في مجالك، وأكثر من ذلك.
+يمكنك التحقق من نتائج النموذج ومقارنتها مباشرة من متصفحك على Hub لمعرفة ما إذا كان يناسبها
+أو التعامل مع الحالات الخاصة بشكل أفضل من غيرها.
+وإذا لم تجد نموذجًا لحالتك الاستخدام، فيمكنك دائمًا البدء في [التدريب](training) الخاص بك!
+
+إذا كان لديك عدة مدخلات، فيمكنك تمرير إدخالك كقائمة:
+
+```py
+transcriber(
+    [
+        "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac",
+        "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac",
+    ]
+)
+```
+
+تعد خطوط الأنابيب مثالية للتجريب نظرًا لأن التبديل من نموذج إلى آخر أمر بسيط للغاية؛ ومع ذلك، هناك بعض الطرق لتحسينها لأحمال عمل أكبر من التجريب. راجع الأدلة التالية التي تتعمق فى التكرار عبر مجموعات البيانات الكاملة أو استخدام خطوط الأنابيب في خادم ويب:
+من الوثائق:
+* [استخدام خطوط الأنابيب على مجموعة بيانات](#using-pipelines-on-a-dataset)
+* [استخدام خطوط الأنابيب لخادم ويب](./pipeline_webserver)
+
+## المعلمات
+
+يدعم [`pipeline`] العديد من المعلمات؛ بعضها خاص بالمهمة، والبعض الآخر عام لجميع خطوط الأنابيب.
+بشكل عام، يمكنك تحديد المعلمات في أي مكان تريده:
+
+```py
+transcriber = pipeline(model="openai/whisper-large-v2", my_parameter=1)
+
+out = transcriber(...)  # سيتم استخدام هذا `my_parameter=1`.
+out = transcriber(..., my_parameter=2)  # سيتم تجاوز هذا واستخدام `my_parameter=2`.
+out = transcriber(...)  # سيتم الرجوع إلى استخدام `my_parameter=1`.
+```
+
+دعونا نلقي نظرة على 3 مهمة:
+
+### الجهاز
+
+إذا كنت تستخدم `device=n`، فإن خط الأنابيب يضع النموذج تلقائيًا على الجهاز المحدد.
+سيعمل هذا بغض النظر عما إذا كنت تستخدم PyTorch أو Tensorflow.
+
+```py
+transcriber = pipeline(model="openai/whisper-large-v2", device=0)
+```
+
+إذا كان النموذج كبيرًا جدًا بالنسبة لوحدة معالجة الرسومات (GPU) واحدة، وأنت تستخدم PyTorch، فيمكنك تعيين `torch_dtype='float16'` لتمكين الاستدلال بدقة FP16. عادةً ما لا يتسبب ذلك في حدوث انخفاضات كبيرة في الأداء، ولكن تأكد من تقييمه على نماذجك!
+
+بدلاً من ذلك، يمكنك تعيين `device_map="auto"` لتحديد كيفية تحميل مخزنات النموذج وتخزينها تلقائيًا. يتطلب استخدام معامل `device_map` مكتبه 🤗 [Accelerate](https://huggingface.co/docs/accelerate):
+
+```bash
+pip install --upgrade accelerate
+```
+
+تقوم الشفرة التالية بتحميل مخزنات النموذج وتخزينها تلقائيًا عبر الأجهزة:
+
+```py
+transcriber = pipeline(model="openai/whisper-large-v2", device_map="auto")
+```
+
+لاحظ أنه إذا تم تمرير `device_map="auto"`، فلا توجد حاجة لإضافة حجة `device=device` عند إنشاء خط الأنابيب الخاص بك، فقد تواجه بعض السلوكيات غير المتوقعة!
+
+### حجم الدفعة
+
+بشكل افتراضي، لن تقوم خطوط الأنابيب بتجميع الاستدلال لأسباب مفصلة [هنا](https://huggingface.co/docs/transformers/main_classes/pipelines#pipeline-batching). والسبب هو أن التجميع ليست أسرع بالضرورة، ويمكن أن تكون أبطأ في الواقع في بعض الحالات.
+
+ولكن إذا نجحت في حالتك الاستخدام، فيمكنك استخدام ما يلي:
+
+```py
+transcriber = pipeline(model="openai/whisper-large-v2", device=0, batch_size=2)
+audio_filenames = [f"https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/{i}.flac" for i in range(1, 5)]
+texts = transcriber(audio_filenames)
+```
+
+هذا يشغل خط الأنابيب على ملفات الصوت الأربعة المتاحة، ولكنه سيمررها على دفعتين
+إلى النموذج (الذي يوجد على وحدة معالجة الرسومات (GPU)، حيث من المرجح أن تساعد التجميع) دون الحاجة إلى أي رمز إضافي منك. 
+يجب أن تتطابق الإخراج دائمًا مع ما كنت ستحصل عليه دون التجميع. المقصود منه فقط كطريقة لمساعدتك في الحصول على سرعة أكبر من خط الأنابيب.
+
+يمكن لخطوط الأنابيب أيضًا تخفيف بعض تعقيدات التجميع لأنه، بالنسبة لبعض خطوط الأنابيب، يجب تقسيم عنصر واحد (مثل ملف صوتي طويل) إلى أجزاء متعددة لمعالجته بواسطة نموذج. يقوم خط الأنابيب بأداء هذه  العملية التي تسمى تجميع الأجزاء [*batch batching*](./main_classes/pipelines#pipeline-chunk-batching) نيابة عنك.
+
+### معلمات خاصة بالمهمة
+
+توفر جميع المهام معلمات خاصة بالمهمة تتيح المرونة والخيارات الإضافية لمساعدتك في أداء عملك.
+على سبيل المثال، تحتوي طريقة [`transformers.AutomaticSpeechRecognitionPipeline.__call__`] على معلمة `return_timestamps` التي تبدو واعدة لترجمة مقاطع الفيديو:
+```py
+>>> transcriber = pipeline(model="openai/whisper-large-v2", return_timestamps=True)
+>>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
+{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.', 'chunks': [{'timestamp': (0.0, 11.88), 'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its'}, {'timestamp': (11.88, 12.38), 'text': ' creed.'}]}
+```
+
+كما ترون، استنتج النموذج النص.وكذلك حدد **وقت** نطق الجمل المختلفة.
+
+تتوفر العديد من المعلمات لكل مهمة، لذا تحقق من مرجع API لكل مهمة لمعرفة ما يمكنك تعديله!
+على سبيل المثال، تحتوي [`~transformers.AutomaticSpeechRecognitionPipeline`] على معلمة `chunk_length_s` مفيدة 
+للعمل على ملفات الصوت الطويلة جدًا (على سبيل المثال، ترجمة الأفلام أو مقاطع الفيديو التي تستغرق ساعة) والتي لا يمكن للنموذج التعامل معها بمفرده:
+
+```python
+>>> transcriber = pipeline(model="openai/whisper-large-v2", chunk_length_s=30)
+>>> transcriber("https://huggingface.co/datasets/reach-vb/random-audios/resolve/main/ted_60.wav")
+{'text': " So in college, I was a government major, which means I had to write a lot of papers. Now, when a normal student writes a paper, they might spread the work out a little like this. So, you know. You get started maybe a little slowly, but you get enough done in the first week that with some heavier days later on, everything gets done and things stay civil. And I would want to do that like that. That would be the plan. I would have it all ready to go, but then actually the paper would come along, and then I would kind of do this. And that would happen every single paper. But then came my 90-page senior thesis, a paper you're supposed to spend a year on. I knew for a paper like that, my normal workflow was not an option, it was way too big a project. So I planned things out and I decided I kind of had to go something like this. This is how the year would go. So I'd start off light and I'd bump it up"}
+```
+
+إذا لم تتمكن من العثور على معلمة قد تساعدك حقًا، فلا تتردد في [طلبها](https://github.com/huggingface/transformers/issues/new?assignees=&labels=feature&template=feature-request.yml)!
+
+
+## استخدام خطوط الأنابيب على مجموعة بيانات
+
+يمكن أيضًا تشغيل خط الأنابيب للاستدلال على مجموعة بيانات كبيرة. أسهل طريقة نوصي بها للقيام بذلك هي باستخدام المتكرر (iterator).:
+
+```py
+def data():
+    for i in range(1000):
+        yield f"My example {i}"
+
+
+pipe = pipeline(model="openai-community/gpt2", device=0)
+generated_characters = 0
+for out in pipe(data()):
+    generated_characters += len(out[0]["generated_text"])
+```
+
+يقوم المؤشر `data()` بإرجاع كل نتيجة، ويتعرف خط الأنابيب تلقائيًا
+المدخل قابل للتحديد ويبدأ في جلب البيانات أثناء
+يستمر في معالجتها على وحدة معالجة الرسومات (GPU) (يستخدم هذا [DataLoader](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader) تحت الغطاء).
+هذا أمر مهم لأنك لا تحتاج إلى تخصيص ذاكرة لمجموعة البيانات بأكملها
+ويمكنك تغذية وحدة معالجة الرسومات (GPU) بأسرع ما يمكن.
+
+نظرًا لأن التجميع قد تسرع الأمور، فقد يكون من المفيد ضبط معلمة `batch_size` هنا.
+
+أبسط طريقة للتنقل خلال مجموعة بيانات هي فقط تحميل واحدة من 🤗 [Datasets](https://github.com/huggingface/datasets/):
+
+```py
+# KeyDataset هي أداة مساعدة ستقوم فقط بإخراج العنصر الذي نهتم به.
+from transformers.pipelines.pt_utils import KeyDataset
+from datasets import load_dataset
+
+pipe = pipeline(model="hf-internal-testing/tiny-random-wav2vec2", device=0)
+dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:10]")
+
+for out in pipe(KeyDataset(dataset, "audio")):
+    print(out)
+```
+
+## استخدام خطوط الأنابيب لخادم ويب
+
+<Tip>
+إن إنشاء محرك استدلال هو موضوع معقد يستحق صفحته الخاصة.
+</Tip>
+
+[Link](./pipeline_webserver)
+
+## خط أنابيب الرؤية
+
+إن استخدام [`pipeline`] لمهام الرؤية مماثل تمامًا.
+
+حدد مهمتك ومرر صورتك إلى المصنف. يمكن أن تكون الصورة رابطًا أو مسارًا محليًا أو صورة مشفرة بتنسيق base64. على سبيل المثال، ما نوع القطط الموضح أدناه؟
+
+![pipeline-cat-chonk](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg)
+
+```py
+>>> from transformers import pipeline
+
+>>> vision_classifier = pipeline(model="google/vit-base-patch16-224")
+>>> preds = vision_classifier(
+...     images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
+... )
+>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds]
+>>> preds
+[{'score': 0.4335, 'label': 'lynx, catamount'}, {'score': 0.0348, 'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor'}, {'score': 0.0324, 'label': 'snow leopard, ounce, Panthera uncia'}, {'score': 0.0239, 'label': 'Egyptian cat'}, {'score': 0.0229, 'label': 'tiger cat'}]
+```
+
+## خط أنابيب النص
+
+إن استخدام [`pipeline`] لمهام NLP مماثل تمامًا.
+
+```py
+>>> from transformers import pipeline
+
+>>> # هذا النموذج هو نموذج "zero-shot-classification".
+>>> # سيصنف النص، ولكن يمكنك اختيار أي تسمية قد تتخيلها
+>>> classifier = pipeline(model="facebook/bart-large-mnli")
+>>> classifier(
+...     "I have a problem with my iphone that needs to be resolved asap!!",
+...     candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"],
+... )
+{'sequence': 'I have a problem with my iphone that needs to be resolved asap!!', 'labels': ['urgent', 'phone', 'computer', 'not urgent', 'tablet'], 'scores': [0.504, 0.479, 0.013, 0.003, 0.002]}
+```
+
+## خط أنابيب متعدد الوسائط
+
+تدعم [`pipeline`] أكثر من طريقة واحدة. على سبيل المثال، تجمع مهمة الإجابة على الأسئلة المرئية (VQA) بين النص والصورة. لا تتردد في استخدام أي رابط صورة تريده وسؤال تريد طرحه حول الصورة. يمكن أن تكون الصورة عنوان URL أو مسارًا محليًا للصورة.
+
+على سبيل المثال، إذا كنت تستخدم هذه [صورة الفاتورة](https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png):
+
+```py
+>>> from transformers import pipeline
+
+>>> vqa = pipeline(model="impira/layoutlm-document-qa")
+>>> output = vqa(
+...     image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png",
+...     question="What is the invoice number?",
+... )
+>>> output[0]["score"] = round(output[0]["score"], 3)
+>>> output
+[{'score': 0.425, 'answer': 'us-001', 'start': 16, 'end': 16}]
+```
+
+<Tip>
+
+لتشغيل المثال أعلاه، تحتاج إلى تثبيت [`pytesseract`](https://pypi.org/project/pytesseract/) بالإضافة إلى 🤗 Transformers:
+
+```bash
+sudo apt install -y tesseract-ocr
+pip install pytesseract
+```
+
+</Tip>
+
+## استخدام `pipeline` على نماذج كبيرة مع 🤗 `accelerate`:
+
+يمكنك بسهولة تشغيل `pipeline` على نماذج كبيرة باستخدام 🤗 `accelerate`! أولاً، تأكد من تثبيت `accelerate` باستخدام `pip install accelerate`.
+
+قم أولاً بتحميل نموذجك باستخدام `device_map="auto"`! سنستخدم `facebook/opt-1.3b` كمثال لنا.
+
+```py
+# pip install accelerate
+import torch
+from transformers import pipeline
+
+pipe = pipeline(model="facebook/opt-1.3b", torch_dtype=torch.bfloat16, device_map="auto")
+output = pipe("This is a cool example!", do_sample=True, top_p=0.95)
+```
+
+يمكنك أيضًا تمرير نماذج محملة بـ 8 بت إذا قمت بتثبيت `bitsandbytes` وإضافة الحجة `load_in_8bit=True`
+
+```py
+# pip install accelerate bitsandbytes
+import torch
+from transformers import pipeline
+
+pipe = pipeline(model="facebook/opt-1.3b", device_map="auto", model_kwargs={"load_in_8bit": True})
+output = pipe("This is a cool example!", do_sample=True, top_p=0.95)
+```
+
+لاحظ أنه يمكنك استبدال نقطة التفتيش بأي نموذج من Hugging Face يدعم تحميل النماذج الكبيرة، مثل BLOOM.
+
+## إنشاء عروض توضيحية ويب من خطوط الأنابيب باستخدام `gradio`
+
+يتم دعم خطوط الأنابيب تلقائيًا في [Gradio](https://github.com/gradio-app/gradio/)، وهي مكتبة تجعل إنشاء تطبيقات تعليم الآلة الجميلة والسهلة الاستخدام على الويب أمرًا سهلاً. أولاً، تأكد من تثبيت Gradio:
+
+```
+pip install gradio
+```
+
+بعد ذلك، يمكنك إنشاء عرض توضيحي ويب حول خط أنابيب تصنيف الصور (أو أي خط أنابيب آخر) في سطر واحد من التعليمات البرمجية عن طريق استدعاء وظيفة [`Interface.from_pipeline`](https://www.gradio.app/docs/interface#interface-from-pipeline) في Gradio لإطلاق خط الأنابيب. يقوم هذا بإنشاء واجهة بديهية للسحب والإفلات في مستعرضك:
+
+```py
+from transformers import pipeline
+import gradio as gr
+
+pipe = pipeline("image-classification", model="google/vit-base-patch16-224")
+
+gr.Interface.from_pipeline(pipe).launch()
+```
+
+
+![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/panda-classification.png)
+
+بشكل افتراضي، يعمل العرض التوضيحي على خادم محلي. إذا كنت تريد مشاركتها مع الآخرين، فيمكنك إنشاء رابط عام مؤقت عن طريق تعيين `share=True` في `launch()`. يمكنك أيضًا استضافة عرضك التوضيحي على [Hugging Face Spaces](https://huggingface.co/spaces) للحصول على رابط دائم.
\ No newline at end of file
diff --git a/docs/source/ar/preprocessing.md b/docs/source/ar/preprocessing.md
new file mode 100644
index 000000000000..8c1f68934d20
--- /dev/null
+++ b/docs/source/ar/preprocessing.md
@@ -0,0 +1,521 @@
+# المعالجة المسبقة Preprocessing
+
+[[open-in-colab]]
+
+قبل تدريب نموذج على مجموعة بيانات، يجب معالجتها مسبقًا وفقًا تنسيق  المتوقع لمدخلات النموذج. سواء كانت بياناتك نصية أو صورًا أو صوتًا، فيجب تحويلها وتجميعها في دفعات من الموترات. يوفر 🤗 Transformers مجموعة من فئات المعالجة المسبقة للمساعدة في إعداد بياناتك للنموذج. في هذا البرنامج التعليمي، ستتعلم أنه بالنسبة لـ:
+
+* للنص، استخدم [مُجزّئ الرموز](./main_classes/tokenizer) لتحويل النص إلى تسلسل من الرموز، وإنشاء تمثيل رقمي للرموز، وتجميعها في موترات(tensors).
+* للكلام والصوت، استخدم [مستخرج الميزات](./main_classes/feature_extractor) لاستخراج ميزات متسلسلة من أشكال موجات الصوت وتحويلها إلى موترات.
+* تستخدم مدخلات الصورة [ImageProcessor](./main_classes/image_processor) لتحويل الصور إلى موترات.
+* تستخدم مدخلات متعددة الوسائط [معالجًا](./main_classes/processors) لدمج مُجزّئ الرموز ومستخرج الميزات أو معالج الصور.
+
+<Tip>
+
+`AutoProcessor` **يعمل دائمًا** ويختار تلقائيًا الفئة الصحيحة للنموذج الذي تستخدمه، سواء كنت تستخدم مُجزّئ رموز أو معالج صور أو مستخرج ميزات أو معالجًا.
+
+</Tip>
+
+قبل البدء، قم بتثبيت 🤗 Datasets حتى تتمكن من تحميل بعض مجموعات البيانات لتجربتها:
+
+```bash
+pip install datasets
+```
+
+## معالجة اللغة الطبيعية (Natural Language Processing (NLP
+
+<Youtube id="Yffk5aydLzg"/>
+
+أداة المعالجة المسبقة الرئيسية للبيانات النصية هي [مُجزّئ اللغوي](main_classes/tokenizer). يقوم مُجزّئ اللغوي بتقسيم النص إلى  "أجزاء لغوية" (tokens) وفقًا لمجموعة من القواعد. يتم تحويل الأجزاء اللغوية إلى أرقام ثم إلى منسوجات، والتي تصبح مدخلات للنموذج. يقوم المجزئ اللغوي بإضافة أي مدخلات إضافية يحتاجها النموذج.
+
+<Tip>
+
+إذا كنت تخطط لاستخدام نموذج مُدرب مسبقًا، فمن المهم استخدامالمجزئ اللغوي المقترن بنفس ذلك النموذج. يضمن ذلك تقسيم النص بنفس الطريقة التي تم بها تقسيم النصوص ما قبل التدريب، واستخدام نفس  القاموس الذي يربط بين الأجزاء اللغوية وأرقامها ( يُشار إليها عادةً باسم المفردات *vocab*) أثناء التدريب المسبق.
+
+</Tip>
+
+ابدأ بتحميل  المُجزّئ اللغوي مُدرب مسبقًا باستخدام طريقة [`AutoTokenizer.from_pretrained`]. يقوم هذا بتنزيل المفردات *vocab* الذي تم تدريب النموذج عليه:
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
+```
+
+ثم مرر نصك إلى المُجزّئ اللغوي:
+
+```py
+>>> encoded_input = tokenizer("Do not meddle in the affairs of wizards, for they are subtle and quick to anger.")
+>>> print(encoded_input)
+{'input_ids': [101, 2079, 2025, 19960, 10362, 1999, 1996, 3821, 1997, 16657, 1010, 2005, 2027, 2024, 11259, 1998, 4248, 2000, 4963, 1012, 102],
+ 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
+```
+
+يعيد المُجزّئ اللغوي قاموسًا يحتوي على ثلاثة عناصر مهمة:
+
+* [input_ids](glossary#input-ids) هي الفهارس المقابلة لكل رمز في الجملة.
+* [attention_mask](glossary#attention-mask) يشير إلى ما إذا كان يجب الانتباه بالرمز أم لا.
+* [token_type_ids](glossary#token-type-ids) يحدد التسلسل الذي ينتمي إليه الرمز عندما يكون هناك أكثر من تسلسل واحد.
+
+أعد إدخالك الأصلي عن طريق فك ترميز `input_ids`:
+
+```py
+>>> tokenizer.decode(encoded_input["input_ids"])
+'[CLS] Do not meddle in the affairs of wizards, for they are subtle and quick to anger. [SEP]'
+```
+
+كما ترى، أضاف المُجزّئ اللغوي رمزين خاصين - `CLS` و`SEP` (مصنف وفاصل) - إلى الجملة. لا تحتاج جميع النماذج إلى
+رموز خاصة، ولكن إذا فعلوا ذلك، فإن المُجزّئ اللغوي يضيفها تلقائيًا لك.
+
+إذا كان هناك عدة جمل تريد معالجتها مسبقًا، فقم بتمريرها كقائمة إلى مُجزّئ اللغوي:
+
+```py
+>>> batch_sentences = [
+...     "But what about second breakfast?",
+...     "Don't think he knows about second breakfast, Pip.",
+...     "What about elevensies?",
+... ]
+>>> encoded_inputs = tokenizer(batch_sentences)
+>>> print(encoded_inputs)
+{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102],
+               [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102],
+               [101, 1327, 1164, 5450, 23434, 136, 102]],
+ 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0],
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                    [0, 0, 0, 0, 0, 0, 0]],
+ 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1],
+                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                    [1, 1, 1, 1, 1, 1, 1]]}
+```
+
+### الحشو Padding
+
+لا تكون الجمل دائمًا بنفس الطول،  وهذا يمكن أن يمثل مشكلة لأن الموترات،وهي مدخلات النموذج، تحتاج إلى شكل موحد. الحشو هو استراتيجية لضمان أن تكون الموترات مستطيلة عن طريق إضافة رمز حشو *padding* خاص إلى الجمل الأقصر.
+
+قم بتعيين معلمة الحشو `padding` إلى `True` لحشو التسلسلات الأقصر في الدفعة لتطابق أطول تسلسل:
+
+```py
+>>> batch_sentences = [
+...     "But what about second breakfast?",
+...     "Don't think he knows about second breakfast, Pip.",
+...     "What about elevensies?",
+... ]
+>>> encoded_input = tokenizer(batch_sentences, padding=True)
+>>> print(encoded_input)
+{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0],
+               [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102],
+               [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]],
+ 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
+ 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
+                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                    [1, 1, 1, 1, 1, 1, 1, 0، 0، 0، 0، 0، 0، 0، 0]]}
+```
+
+تم الآن حشو الجملتين الأولى والثالثة بـ `0` لأنهما أقصر.
+
+### البتر Truncation
+
+وعلى صعيد أخر، قد يكون التسلسل طويلًا جدًا بالنسبة للنموذج للتعامل معه. في هذه الحالة، ستحتاج إلى بتر التسلسل إلى طول أقصر.
+
+قم بتعيين معلمة `truncation` إلى `True` لتقليم تسلسل إلى الطول الأقصى الذي يقبله النموذج:
+
+```py
+>>> batch_sentences = [
+...     "But what about second breakfast?",
+...     "Don't think he knows about second breakfast, Pip.",
+...     "What about elevensies?",
+... ]
+>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True)
+>>> print(encoded_input)
+{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0],
+               [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102],
+               [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]],
+ 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0، 0، 0، 0، 0]]،
+ 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0، 0، 0، 0],
+                    [1, 1, 1, 1, 1, 1, 1، 1، 1، 1، 1، 1، 1، 1، 1، 1],
+                    [1، 1، 1، 1، 1، 1، 1، 0، 0، 0، 0، 0، 0، 0، 0، 0]]}
+```
+
+<Tip>
+
+تحقق من دليل المفاهيم [Padding and truncation](./pad_truncation) لمعرفة المزيد حول معامﻻت الحشو و البتر المختلفة.
+
+</Tip>
+
+### بناء الموترات Build tensors
+
+أخيرًا، تريد أن يقوم  المجزئ اللغوي بإرجاع موترات (tensors) الفعلية التي ستُغذي النموذج.
+
+قم بتعيين معلمة `return_tensors` إلى إما `pt` لـ PyTorch، أو `tf` لـ TensorFlow:
+
+<frameworkcontent>
+<pt>
+
+```py
+>>> batch_sentences = [
+...     "But what about second breakfast?",
+...     "Don't think he knows about second breakfast, Pip.",
+...     "What about elevensies?",
+... ]
+>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt")
+>>> print(encoded_input)
+{'input_ids': tensor([[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0],
+                      [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102],
+                      [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]]),
+ 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
+ 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
+                           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                           [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}
+```
+</pt>
+<tf>
+ 
+```py
+>>> batch_sentences = [
+...     "But what about second breakfast?",
+...     "Don't think he knows about second breakfast, Pip.",
+...     "What about elevensies?",
+... ]
+>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="tf")
+>>> print(encoded_input)
+{'input_ids': <tf.Tensor: shape=(2, 9), dtype=int32, numpy=
+array([[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0],
+       [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102],
+       [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]],
+      dtype=int32)>,
+ 'token_type_ids': <tf.Tensor: shape=(2, 9), dtype=int32, numpy=
+array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>,
+ 'attention_mask': <tf.Tensor: shape=(2, 9), dtype=int32, numpy=
+array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
+       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+       [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>}
+```
+</tf>
+</frameworkcontent>
+
+<Tip>
+
+تدعم خطوط الأنابيب المختلفة معامل مُجزِّئ الرموز(tokenizer) بشكل مختلف في طريقة `()__call__` الخاصة بها.
+و خطوط الأنابيب `text-2-text-generation` تدعم فقط `truncation`.
+و خطوط الأنابيب `text-generation` تدعم `max_length` و`truncation` و`padding` و`add_special_tokens`.
+أما في خطوط الأنابيب `fill-mask`، يمكن تمرير معامل مُجزِّئ الرموز (tokenizer) في المتغير `tokenizer_kwargs` (قاموس).
+
+</Tip>
+
+## الصوت Audio
+
+بالنسبة للمهام الصوتية، ستحتاج إلى [مستخرج الميزات](main_classes/feature_extractor) لإعداد مجموعة البيانات الخاصة بك للنماذج. تم تصميم مستخرج الميزات لاستخراج الميزات من بيانات الصوت الخام، وتحويلها إلى موتورات.
+
+قم بتحميل مجموعة بيانات [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) (راجع البرنامج التعليمي لـ 🤗 [Datasets](https://huggingface.co/docs/datasets/load_hub) لمزيد من التفاصيل حول كيفية تحميل مجموعة بيانات) لمعرفة كيفية استخدام مستخرج الميزات مع مجموعات البيانات الصوتية:
+
+```py
+>>> from datasets import load_dataset, Audio
+
+>>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")
+```
+
+الوصول إلى العنصر الأول من عمود `audio` لمعرفة المدخلات. يؤدي استدعاء عمود `audio` إلى تحميل ملف الصوت وإعادة أخذ العينات تلقائيًا:
+
+```py
+>>> dataset[0]["audio"]
+{'array': array([ 0.        ,  0.00024414, -0.00024414, ..., -0.00024414,
+         0.        ,  0.        ], dtype=float32),
+ 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav',
+ 'sampling_rate': 8000}
+```
+
+يعيد هذا ثلاثة عناصر:
+
+* `array` هو إشارة الكلام المحملة - وإعادة أخذ العينات المحتملة - كصفيف 1D.
+* `path` يشير إلى موقع ملف الصوت.
+* `sampling_rate` يشير إلى عدد نقاط البيانات في إشارة الكلام المقاسة في الثانية.
+
+بالنسبة لهذا البرنامج التعليمي، ستستخدم نموذج [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base). الق نظرة على بطاقة النموذج، وستتعلم أن Wav2Vec2 مُدرب مسبقًا على صوت الكلام الذي تم أخذ عينات منه بمعدل 16 كيلو هرتز. من المهم أن يتطابق معدل أخذ العينات لبيانات الصوت مع معدل أخذ العينات لمجموعة البيانات المستخدمة لتدريب النموذج مسبقًا. إذا لم يكن معدل أخذ العينات لبياناتك هو نفسه، فيجب إعادة أخذ العينات من بياناتك.
+
+1. استخدم طريقة [`~datasets.Dataset.cast_column`] في 🤗 Datasets لإعادة أخذ العينات بمعدل أخذ العينات 16 كيلو هرتز:
+
+```py
+>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000))
+```
+
+2. استدعاء عمود `audio` مرة أخرى لأخذ عينات من ملف الصوت:
+
+```py
+>>> dataset[0]["audio"]
+{'array': array([ 2.3443763e-05,  2.1729663e-04,  2.2145823e-04, ...,
+         3.8356509e-05, -7.3497440e-06, -2.1754686e-05], dtype=float32),
+ 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav',
+ 'sampling_rate': 16000}
+```
+
+بعد ذلك، قم بتحميل مستخرج الميزات لتطبيع وحشو المدخلات. عند إضافة حشو للبيانات النصية، تتم إضافة "0" للتسلسلات الأقصر. تنطبق نفس الفكرة على بيانات الصوت. يضيف مستخرج الميزات "0" - الذي يتم تفسيره على أنه صمت - إلى "array".
+
+قم بتحميل مستخرج الميزات باستخدام [`AutoFeatureExtractor.from_pretrained`]:
+
+```py
+>>> from transformers import AutoFeatureExtractor
+
+>>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
+```
+
+مرر صفيف الصوت إلى مستخرج الميزات. كما نوصي بإضافة معامل `sampling_rate` في مستخرج الميزات من أجل تصحيح الأخطاء الصامتة التي قد تحدث بشكل أفضل.
+
+```py
+>>> audio_input = [dataset[0]["audio"]["array"]]
+>>> feature_extractor(audio_input, sampling_rate=16000)
+{'input_values': [array([ 3.8106556e-04,  2.7506407e-03,  2.8015103e-03, ...,
+        5.6335266e-04,  4.6588284e-06, -1.7142107e-04], dtype=float32)]}
+```
+
+تمامًا مثل مُجزِّئ الرموز، يمكنك تطبيق الحشو أو البتر للتعامل مع التسلسلات المتغيرة في دفعة. الق نظرة على طول التسلسل لهاتين العينتين الصوتيتين:
+
+```py
+>>> dataset[0]["audio"]["array"].shape
+(173398,)
+
+>>> dataset[1]["audio"]["array"].shape
+(106496,)
+```
+
+قم بإنشاء دالة لمعالجة مجموعة البيانات بحيث يكون للنماذج الصوتية نفس الأطوال. حدد أقصى طول للعينة ، وسيقوم مستخرج الميزات إما بإضافة حشو أو بتر التسلسلات لمطابقتها:
+
+```py
+>>> def preprocess_function(examples):
+...     audio_arrays = [x["array"] for x in examples["audio"]]
+...     inputs = feature_extractor(
+...         audio_arrays,
+...         sampling_rate=16000,
+...         padding=True,
+...         max_length=100000,
+...         truncation=True,
+...     )
+...     return inputs
+```
+
+قم بتطبيق `preprocess_function` على أول بضع أمثلة في مجموعة البيانات:
+
+```py
+>>> processed_dataset = preprocess_function(dataset[:5])
+```
+
+أطوال العينات الآن متساوية وتطابق الطول الأقصى المحدد. يمكنك الآن تمرير مجموعة البيانات المعالجة إلى النموذج!
+
+```py
+>>> processed_dataset["input_values"][0].shape
+(100000,)
+
+>>> processed_dataset["input_values"][1].shape
+(100000,)
+```
+
+## رؤية الكمبيوتر Computer vision
+
+بالنسبة لمهام رؤية الحاسوبية، ستحتاج إلى معالج صور [image processor](main_classes/image_processor) لإعداد مجموعة البيانات الخاصة بك لتناسب النموذج. تتكون معالجة الصور المسبقة من عدة خطوات لتحويل الصور إلى الشكل الذي يتوقعه النموذج. وتشمل هذه الخطوات، على سبيل المثال لا الحصر، تغيير الحجم والتطبيع وتصحيح قناة الألوان وتحويل الصور إلى موترات(tensors).
+
+<Tip>
+
+عادة ما تتبع معالجة الصور المسبقة شكلاً من أشكال زيادة البيانات (التضخيم). كلا العمليتين،  معالجة الصور المسبقة وزيادة الصور تغيران بيانات الصورة، ولكنها تخدم أغراضًا مختلفة:
+
+*زيادة البيانات: تغيير الصور عن طريق زيادة الصور بطريقة يمكن أن تساعد في منع الإفراط في التعميم وزيادة متانة النموذج. يمكنك أن تكون مبدعًا في كيفية زيادة بياناتك - ضبط السطوع والألوان، واالقص، والدوران، تغيير الحجم، التكبير، إلخ. ومع ذلك، كن حذرًا من عدم تغيير معنى الصور بزياداتك.
+*معالجة الصور المسبقة: تضمن معالجة الصور اتتطابق الصور مع تنسيق الإدخال المتوقع للنموذج. عند ضبط نموذج رؤية حاسوبية بدقة، يجب معالجة الصور بالضبط كما كانت عند تدريب النموذج في البداية.
+
+يمكنك استخدام أي مكتبة تريدها لزيادة بيانات الصور. لمعالجة الصور المسبقة، استخدم `ImageProcessor` المرتبط بالنموذج.
+
+</Tip>
+
+قم بتحميل مجموعة بيانات [food101](https://huggingface.co/datasets/food101) (راجع دليل 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub) لمزيد من التفاصيل حول كيفية تحميل مجموعة بيانات) لمعرفة كيف يمكنك استخدام معالج الصور مع مجموعات بيانات رؤية الحاسب:
+
+<Tip>
+
+استخدم معامل `split` من 🤗 Datasets لتحميل عينة صغيرة فقط من مجموعة التدريب نظرًا لحجم البيانات كبيرة جدًا!
+
+</Tip>
+
+```py
+>>> from datasets import load_dataset
+
+>>> dataset = load_dataset("food101", split="train[:100]")
+```
+
+بعد ذلك، الق نظرة على الصورة مع ميزة 🤗 Datasets [`Image`](https://huggingface.co/docs/datasets/package_reference/main_classes?highlight=image#datasets.Image):
+
+```py
+>>> dataset[0]["image"]
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vision-preprocess-tutorial.png"/>
+</div>
+
+قم بتحميل معالج الصور باستخدام [`AutoImageProcessor.from_pretrained`]:
+
+```py
+>>> from transformers import AutoImageProcessor
+
+>>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
+```
+
+أولاً، دعنا نضيف بعض الزيادات إلى الصور. يمكنك استخدام أي مكتبة تفضلها، ولكن في هذا الدليل، سنستخدم وحدة [`transforms`](https://pytorch.org/vision/stable/transforms.html) من torchvision. إذا كنت مهتمًا باستخدام مكتبة زيادة بيانات أخرى، فتعرف على كيفية القيام بذلك في [دفاتر Albumentations](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_albumentations.ipynb) أو [دفاتر Kornia](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_kornia.ipynb).
+
+1. هنا نستخدم [`Compose`](https://pytorch.org/vision/master/generated/torchvision.transforms.Compose.html) لربط بعض التحولات معًا - [`RandomResizedCrop`](https://pytorch.org/vision/main/generated/torchvision.transforms.RandomResizedCrop.html) و [`ColorJitter`](https://pytorch.org/vision/main/generated/torchvision.transforms.ColorJitter.html).
+لاحظ بالنسبة لتغيير الحجم، يمكننا الحصول على متطلبات حجم الصورة من `image_processor`. بالنسبة لبعض النماذج، يُتوقع ارتفاع وعرض دقيقين، بينما بالنسبة للنماذج الأخرى، يتم تحديد  الحافة الأقصر`shortest_edge` فقط.
+
+```py
+>>> from torchvision.transforms import RandomResizedCrop, ColorJitter, Compose
+
+>>> size = (
+...     image_processor.size["shortest_edge"]
+...     if "shortest_edge" in image_processor.size
+...     else (image_processor.size["height"], image_processor.size["width"])
+... )
+
+>>> _transforms = Compose([RandomResizedCrop(size), ColorJitter(brightness=0.5, hue=0.5)])
+```
+
+2. يقبل النموذج [`pixel_values`](model_doc/vision-encoder-decoder#transformers.VisionEncoderDecoderModel.forward.pixel_values)
+كإدخال له. يمكن لـ `ImageProcessor` التعامل مع تطبيع الصور، وتوليد موترات(tensors) مناسبة.
+قم بإنشاء دالة تجمع بين تضخيم بيانات الصور ومعالجة الصور المسبقة لمجموعة من الصور وتوليد `pixel_values`:
+
+```py
+>>> def transforms(examples):
+...     images = [_transforms(img.convert("RGB")) for img in examples["image"]]
+...     examples["pixel_values"] = image_processor(images, do_resize=False, return_tensors="pt")["pixel_values"]
+...     return examples
+```
+
+<Tip>
+
+في المثال أعلاه، قمنا بتعيين `do_resize=False` لأننا قمنا بالفعل بتغيير حجم الصور في تحويل زيادة الصور،
+واستفدنا من خاصية `size` من `image_processor` المناسب. إذا لم تقم بتغيير حجم الصور أثناء زيادة الصور،
+فاترك هذا المعلمة. بشكل افتراضي، ستتعامل `ImageProcessor` مع تغيير الحجم.
+
+إذا كنت ترغب في تطبيع الصور كجزء من تحويل زيادة الصور، فاستخدم قيم `image_processor.image_mean`،
+و `image_processor.image_std`.
+</Tip>
+
+3. ثم استخدم 🤗 Datasets[`~datasets.Dataset.set_transform`] لتطبيق التحولات أثناء التنقل:
+```py
+>>> dataset.set_transform(transforms)
+```
+
+4. الآن عند الوصول إلى الصورة، ستلاحظ أن معالج الصور قد أضاف `pixel_values`. يمكنك تمرير مجموعة البيانات المعالجة إلى النموذج الآن!
+
+```py
+>>> dataset[0].keys()
+```
+
+هكذا تبدو الصورة بعد تطبيق التحولات. تم اقتصاص الصورة بشكل عشوائي وتختلف خصائص الألوان بها.
+
+```py
+>>> import numpy as np
+>>> import matplotlib.pyplot as plt
+
+>>> img = dataset[0]["pixel_values"]
+>>> plt.imshow(img.permute(1, 2, 0))
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/preprocessed_image.png"/>
+</div>
+
+<Tip>
+
+بالنسبة للمهام مثل الكشف عن الأشياء، والتجزئة الدلالية، والتجزئة المثالية، والتجزئة الشاملة، يوفر `ImageProcessor`
+تقوم هذه الطرق بتحويل النواتج الأولية للنموذج إلى تنبؤات ذات معنى مثل مربعات الحدود،
+أو خرائط التجزئة.
+
+</Tip>
+
+### الحشو Pad
+
+في بعض الحالات، على سبيل المثال، عند ضبط نموذج [DETR](./model_doc/detr) بدقة، يقوم النموذج بتطبيق زيادة المقياس أثناء التدريب. قد يتسبب ذلك في اختلاف أحجام الصور في دفعة واحدة. يمكنك استخدام [`DetrImageProcessor.pad`]
+من [`DetrImageProcessor`] وتحديد دالة `collate_fn` مخصصة لتجميع الصور معًا.
+
+```py
+>>> def collate_fn(batch):
+...     pixel_values = [item["pixel_values"] for item in batch]
+...     encoding = image_processor.pad(pixel_values, return_tensors="pt")
+...     labels = [item["labels"] for item in batch]
+...     batch = {}
+...     batch["pixel_values"] = encoding["pixel_values"]
+...     batch["pixel_mask"] = encoding["pixel_mask"]
+...     batch["labels"] = labels
+...     return batch
+```
+
+## متعدد الوسائط Mulimodal
+
+بالنسبة للمهام التي تتطلب مدخلات متعددة الوسائط، ستحتاج إلى معالج [processor](main_classes/processors) لإعداد مجموعة البيانات الخاصة بك لتناسب النموذج. يقترن المعالج بين  بمعالجين آخرين مثل محول النص إلى رمز ومستخرج الميزات.
+
+قم بتحميل مجموعة بيانات [LJ Speech](https://huggingface.co/datasets/lj_speech) (راجع دليل 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub) لمزيد من التفاصيل حول كيفية تحميل مجموعة بيانات) لمعرفة كيف يمكنك استخدام معالج للتعرف التلقائي على الكلام (ASR):
+
+```py
+>>> from datasets import load_dataset
+
+>>> lj_speech = load_dataset("lj_speech", split="train")
+```
+
+بالنسبة لـ ASR، فأنت تركز بشكل أساسي على `audio` و `text` لذا يمكنك إزالة الأعمدة الأخرى:
+
+```py
+>>> lj_speech = lj_speech.map(remove_columns=["file", "id", "normalized_text"])
+```
+
+الآن الق نظرة على أعمدة `audio` و `text`:
+```py
+>>> lj_speech = lj_speech.map(remove_columns=["file", "id", "normalized_text"])
+```
+
+الآن الق نظرة على أعمدة `audio` و `text`:
+
+```py
+>>> lj_speech[0]["audio"]
+{'array': array([-7.3242188e-04, -7.6293945e-04, -6.4086914e-04, ...,
+         7.3242188e-04,  2.1362305e-04,  6.1035156e-05], dtype=float32),
+ 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav',
+ 'sampling_rate': 22050}
+
+>>> lj_speech[0]["text"]
+'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition'
+```
+
+تذكر أنه يجب عليك دائمًا [إعادة أخذ العينات](preprocessing#audio) لمعدل أخذ العينات في مجموعة البيانات الصوتية الخاصة بك لمطابقة معدل أخذ العينات في مجموعة البيانات المستخدمة لتدريب النموذج مسبقًا!
+
+```py
+>>> lj_speech = lj_speech.cast_column("audio", Audio(sampling_rate=16_000))
+```
+
+قم بتحميل معالج باستخدام [`AutoProcessor.from_pretrained`]:
+
+```py
+>>> from transformers import AutoProcessor
+
+>>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
+```
+
+1. قم بإنشاء دالة لمعالجة بيانات الصوت الموجودة في `array` إلى `input_values`، ورموز `text` إلى `labels`. هذه هي المدخلات للنموذج:
+
+```py
+>>> def prepare_dataset(example):
+...     audio = example["audio"]
+
+...     example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000))
+
+...     return example
+```
+
+2. قم بتطبيق دالة `prepare_dataset` على عينة:
+
+```py
+>>> prepare_dataset(lj_speech[0])
+```
+
+لقد أضاف المعالج الآن `input_values` و `labels`، وتم أيضًا إعادة أخذ العينات لمعدل أخذ العينات بشكل صحيح إلى 16 كيلو هرتز. يمكنك تمرير مجموعة البيانات المعالجة إلى النموذج الآن!
diff --git a/docs/source/ar/quicktour.md b/docs/source/ar/quicktour.md
new file mode 100644
index 000000000000..9a99c28287d6
--- /dev/null
+++ b/docs/source/ar/quicktour.md
@@ -0,0 +1,543 @@
+# جولة سريعة
+
+[[open-in-colab]]
+
+ابدأ رحلتك مع مكتبة 🤗 Transformers! سواء كنت مطورًا أو مستخدمًا عاديًا، ستساعدك هذه الجولة السريعة على البدء وستُظهر لك كيفية استخدام [`pipeline`] للاستنتاج، وتحميل نموذج مُدرب مسبقًا ومعالج مُسبق مع [AutoClass](./model_doc/auto)، وتدريب نموذج بسرعة باستخدام PyTorch أو TensorFlow. إذا كنت مبتدئًا، نوصي بالاطلاع على دروسنا أو [الدورة](https://huggingface.co/course/chapter1/1) للحصول على شرح أكثر تعمقًا للمفاهيم المقدمة هنا.
+
+قبل البدء، تأكد من تثبيت جميع المكتبات الضرورية:
+
+```bash
+!pip install transformers datasets evaluate accelerate
+```
+
+ستحتاج أيضًا إلى تثبيت إطار عمل التعلم الآلي المفضل لديك:
+
+<frameworkcontent>
+<pt>
+
+```bash
+pip install torch
+```
+</pt>
+<tf>
+
+```bash
+pip install tensorflow
+```
+</tf>
+</frameworkcontent>
+
+## خط الأنابيب
+
+<Youtube id="tiZFewofSLM"/>
+
+يمثل [`pipeline`] أسهل وأسرع طريقة لاستخدام نموذج مُدرب مسبقًا للاستنتاج. يمكنك استخدام [`pipeline`] جاهزًا للعديد من المهام عبر طرق مختلفة، والتي يظهر بعضها في الجدول أدناه:
+
+<Tip>
+
+للاطلاع على القائمة الكاملة للمهام المتاحة، راجع [مرجع واجهة برمجة التطبيقات الخاصة بخط الأنابيب](./main_classes/pipelines).
+
+</Tip>
+
+<div dir="rtl">
+
+| **المهمة**                     | **الوصف**                                                                                              | **الطريقة**    | **معرف خط الأنابيب**                       |
+|------------------------------|--------------------------------------------------------------------------------------------------------------|-----------------|-----------------------------------------------|
+| تصنيف النص          | تعيين تسمية إلى تسلسل نص معين                                                                   | NLP             | pipeline(task=“sentiment-analysis”)           |
+| توليد النص              | توليد نص بناءً على موجه معين                                                                                 | NLP             | pipeline(task=“text-generation”)              |
+| تلخيص                | توليد ملخص لتسلسل نص أو مستند                                                         | NLP             | pipeline(task=“summarization”)                |
+| تصنيف الصور         | تعيين تسمية لصورة معينة                                                                                   | رؤية حاسوبية | pipeline(task=“image-classification”)         |
+| تجزئة الصورة           | تعيين تسمية لكل بكسل فردي في الصورة (يدعم التجزئة الدلالية، والمجملة، وتجزئة مثيلات) | رؤية حاسوبية | pipeline(task=“image-segmentation”)           |
+| اكتشاف الأشياء             | التنبؤ بحدود الأشياء وفئاتها في صورة معينة                                                | رؤية حاسوبية | pipeline(task=“object-detection”)             |
+| تصنيف الصوت         | تعيين تسمية لبيانات صوتية معينة                                                                            | صوتي           | pipeline(task=“audio-classification”)         |
+| التعرف على الكلام التلقائي | نسخ الكلام إلى نص                                                                                  | صوتي           | pipeline(task=“automatic-speech-recognition”) |
+| الإجابة على الأسئلة البصرية    | الإجابة على سؤال حول الصورة، مع إعطاء صورة وسؤال                                             | متعدد الوسائط      | pipeline(task=“vqa”)                          |
+| الإجابة على أسئلة المستندات  | الإجابة على سؤال حول المستند، مع إعطاء مستند وسؤال                                        | متعدد الوسائط      | pipeline(task="document-question-answering")  |
+| كتابة تعليق على الصورة             | إنشاء تعليق على صورة معينة                                                                         | متعدد الوسائط      | pipeline(task="image-to-text")                |
+
+</div>
+ابدأ بإنشاء مثيل من [`pipeline`] وتحديد المهمة التي تريد استخدامه لها. في هذا الدليل، ستستخدم خط الأنابيب للتحليل النصي كنموذج:
+
+```py
+>>> from transformers import pipeline
+
+>>> classifier = pipeline("sentiment-analysis")
+```
+
+يقوم [`pipeline`] بتنزيل وتخزين نسخة احتياطية من نموذج افتراضي [مُدرب مسبقًا](https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english) ومعالج للتحليل النصي. الآن يمكنك استخدام `classifier` على النص المستهدف:
+
+```py
+>>> classifier("We are very happy to show you the 🤗 Transformers library.")
+[{'label': 'POSITIVE', 'score': 0.9998}]
+```
+
+إذا كان لديك أكثر من إدخال واحد، قم بتمرير إدخالاتك كقائمة إلى [`pipeline`] لإرجاع قائمة من القواميس:
+
+```py
+>>> results = classifier(["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."])
+>>> for result in results:
+...     print(f"label: {result['label']}, with score: {round(result['score'], 4)}")
+label: POSITIVE, with score: 0.9998
+label: NEGATIVE, with score: 0.5309
+```
+يمكن لخط الأنابيب أيضًا أن يتنقل خلال مجموعة بيانات كاملة لأي مهمة تريدها. كمثال على ذلك، دعنا نختار التعرف على الكلام التلقائي كمهمة لنا:
+
+```py
+>>> import torch
+>>> from transformers import pipeline
+
+>>> speech_recognizer = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")
+```
+
+قم بتحميل مجموعة بيانات صوتية (راجع دليل البدء السريع لـ 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart#audio) للحصول على مزيد من التفاصيل) التي تريد التنقل خلالها. على سبيل المثال، قم بتحميل مجموعة بيانات [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14):
+
+```py
+>>> from datasets import load_dataset, Audio
+
+>>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")  # doctest: +IGNORE_RESULT
+```
+
+يجب التأكد من أن نفس الجودة الصوتية (معدل أخذ العينات) لمجموعة البيانات يتطابق مع معدل أخذ العينات الذي تم تدريب [`facebook/wav2vec2-base-960h`](https://huggingface.co/facebook/wav2vec2-base-960h) عليه:
+
+```py
+>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=speech_recognizer.feature_extractor.sampling_rate))
+```
+
+يتم تحميل الملفات الصوتية وإعادة تشكيلها تلقائيًا عند استدعاء العمود "audio".
+استخرج المصفوفات الموجية الخام من أول 4 عينات ومررها كقائمة إلى خط الأنابيب:
+
+```py
+>>> result = speech_recognizer(dataset[:4]["audio"])
+>>> print([d["text"] for d in result])
+['I WOULD LIKE TO SET UP A JOINT ACCOUNT WITH MY PARTNER HOW DO I PROCEED WITH DOING THAT', "FONDERING HOW I'D SET UP A JOIN TO HELL T WITH MY WIFE AND WHERE THE AP MIGHT BE", "I I'D LIKE TOY SET UP A JOINT ACCOUNT WITH MY PARTNER I'M NOT SEEING THE OPTION TO DO IT ON THE APSO I CALLED IN TO GET SOME HELP CAN I JUST DO IT OVER THE PHONE WITH YOU AND GIVE YOU THE INFORMATION OR SHOULD I DO IT IN THE AP AN I'M MISSING SOMETHING UQUETTE HAD PREFERRED TO JUST DO IT OVER THE PHONE OF POSSIBLE THINGS", 'HOW DO I FURN A JOINA COUT']
+```
+
+بالنسبة لمجموعات البيانات الكبيرة  التي تحتوي على مدخلات ضخمة (كما هو الحال في البيانات الصوتية أو المرئية)، يفضل تمرير مولد (generator) بدلاً من قائمة لتحميل جميع المدخلات في الذاكرة دفعة واحدة. راجع [مرجع واجهة برمجة التطبيقات الخاصة بخط الأنابيب](./main_classes/pipelines) للحصول على مزيد من المعلومات.
+
+### ااستخدم نموذجًا ومجزئًا آخرين في خط الأنابيب
+
+يمكن لخط الأنابيب [`pipeline`] استيعاب أي نموذج من [Hub](https://huggingface.co/models)، مما يسهل التكيف مع حالات الاستخدام الأخرى. على سبيل المثال، إذا كنت تريد نموذجًا قادرًا على التعامل مع النص الفرنسي، فاستخدم العلامات على Hub لفلتره نموذج مناسب. تعيد النتيجة الأولى المرشحة نموذج BERT متعدد اللغات [BERT model](https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment) الذي تم ضبطه مسبقًا للتحليل المشاعر والذي يمكنك استخدامه للنص الفرنسي:
+
+```py
+>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
+```
+
+<frameworkcontent>
+<pt>
+استخدم [`AutoModelForSequenceClassification`] و [`AutoTokenizer`] لتحميل النموذج المُدرب مسبقًا ومعالجته المرتبط به (مزيد من المعلومات حول `AutoClass` في القسم التالي):
+
+```py
+>>> from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+>>> model = AutoModelForSequenceClassification.from_pretrained(model_name)
+>>> tokenizer = AutoTokenizer.from_pretrained(model_name)
+```
+</pt>
+<tf>
+استخدم [`TFAutoModelForSequenceClassification`] و [`AutoTokenizer`] لتحميل النموذج المُدرب مسبقًا ومعالجته المرتبط به (مزيد من المعلومات حول `TFAutoClass` في القسم التالي):
+
+```py
+>>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
+
+>>> model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
+>>> tokenizer = AutoTokenizer.from_pretrained(model_name)
+```
+</tf>
+</frameworkcontent>
+
+حدد النموذج والمعالج في [`pipeline`]. الآن يمكنك تطبيق `classifier` على النص الفرنسي:
+
+```py
+>>> classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
+>>> classifier("Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers.")
+[{'label': '5 stars', 'score': 0.7273}]
+```
+إذا لم تجد نموذجًا جاهزًا يناسب مهمتك، فستحتاج إلى ضبط نموذج مُدرب مسبقًا على بياناتك. اطلع على [دليل الضبط الدقيق](./training) للتعرف على كيفية القيام بذلك. وبعد ضبط نموذجك المُدرب مسبقًا، يرجى مراعاة [المشاركة](./model_sharing) النموذج مع المجتمع على Hub لمساعدة الجميع في مجال التعلم الآلي! 🤗
+
+## AutoClass
+
+<Youtube id="AhChOFRegn4"/>
+
+في الخلفية، تعمل فئتا [`AutoModelForSequenceClassification`] و [`AutoTokenizer`] معًا لتشغيل دالة pipeline()  الذي استخدمتها أعلاه. تعتبر [AutoClass](./model_doc/auto) اختصارًا يقوم تلقائيًا باسترداد بنية نموذج مُدرب مسبقًا من اسمه أو مساره. كل ما عليك فعله هو تحديد فئة `AutoClass` المناسبة لمهمتك وفئة المعالجة المرتبطة بها.
+
+لنعد إلى المثال من القسم السابق ولنرى كيف يمكنك استخدام `AutoClass` لتكرار نتائج خط الأنابيب.
+
+### المجزئ التلقائي (AutoTokenizer)
+
+يتولى المجزئ مسؤولية تحويل النص إلى مصفوفة من الأرقام (رموز) يمكن للنموذج فهمها ومعالجتها. هناك قواعد متعددة تحكم عملية التجزئة، بما في ذلك كيفية تقسيم كلمة وما هو المستوى الذي يجب أن تقسيم الكلمات عنده (تعرف على المزيد حول المعالجة في [ملخص المجزئ](./tokenizer_summary)). أهم شيء يجب تذكره هو أنك تحتاج إلى إنشاء مثيل للمجزئ بنفس اسم النموذج لضمان استخدامك لقواعد التجزئة نفسها التي تم تدريب النموذج عليها.
+
+قم بتحميل المجزئ باستخدام [`AutoTokenizer`]:
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
+>>> tokenizer = AutoTokenizer.from_pretrained(model_name)
+```
+
+مرر نصك إلى المجزئ:
+
+```py
+>>> encoding = tokenizer("We are very happy to show you the 🤗 Transformers library.")
+>>> print(encoding)
+{'input_ids': [101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103, 100, 58263, 13299, 119, 102],
+ 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
+```
+
+يعيد المجزئ قاموسًا يحتوي على:
+
+* [input_ids](./glossary#input-ids): التمثيلات الرقمية لرموزك.
+* [attention_mask](./glossary#attention-mask): تشير إلى الرموز التي يجب الانتباه بها.
+
+يمكن المجزئ أيضًا قبول قائمة من المدخلات، ويقوم بـ "حشو" و"تقصير" النص لإرجاع كدفعة بطول موحد:
+
+<frameworkcontent>
+<pt>
+
+```py
+>>> pt_batch = tokenizer(
+...     ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],
+...     padding=True,
+...     truncation=True,
+...     max_length=512,
+...     return_tensors="pt",
+... )
+```
+</pt>
+<tf>
+
+```py
+>>> tf_batch = tokenizer(
+...     ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],
+...     padding=True,
+...     truncation=True,
+...     max_length=512,
+...     return_tensors="tf",
+... )
+```
+</tf>
+</frameworkcontent>
+
+<Tip>
+
+اطلع على [الدليل التمهيدي للمعالجة المسبقة](./preprocessing) للحصول على مزيد من التفاصيل حول المعالجة، وكيفية استخدام [`AutoImageProcessor`] و [`AutoFeatureExtractor`] و [`AutoProcessor`] لمعالجة الصور والصوت والإدخالات متعددة الوسائط.
+
+</Tip>
+
+### AutoModel
+
+<frameworkcontent>
+<pt>
+تقدم مكتبة 🤗 Transformers طريقة بسيطة وموحدة لتحميل نماذج مدربة مسبقًا. وهذا يعني أنه يمكنك تحميل [`AutoModel`] كما لو كنت تقوم بتحميل [`AutoTokenizer`]. الفرق الوحيد هو اختيار فئة [`AutoModel`] المناسبة للمهمة. بالنسبة لتصنيف النص (أو التسلسل)، يجب عليك تحميل [`AutoModelForSequenceClassification`]:
+
+```py
+>>> from transformers import AutoModelForSequenceClassification
+
+>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
+>>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)
+```
+
+<Tip>
+
+راجع [ملخص المهمة](./task_summary) للاطلاع على المهام التي تدعمها فئة [`AutoModel`].
+
+</Tip>
+
+الآن قم بتمرير دفعة المدخلات المُعالجة مسبقًا مباشرة إلى النموذج. عليك فقط فك تعبئة القاموس عن طريق إضافة `**`:
+
+# تدريب النموذج
+
+الآن، مرر دفعة المدخلات المعالجة مسبقًا مباشرة إلى النموذج. ما عليك سوى فك تعبئة القاموس عن طريق إضافة `**`:
+
+```py
+>>> pt_outputs = pt_model(**pt_batch)
+```
+
+يُخرج النموذج التنشيطات النهائية في سمة `logits`. طبق دالة softmax على `logits` للحصول على الاحتمالات:
+
+```py
+>>> from torch import nn
+
+>>> pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1)
+>>> print(pt_predictions)
+tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
+        [0.2084, 0.1826, 0.1969, 0.1755, 0.2365]], grad_fn=<SoftmaxBackward0>)
+```
+</pt>
+<tf>
+يوفر 🤗 Transformers طريقة بسيطة وموحدة لتحميل مثيلات مُدربة مسبقًا. وهذا يعني أنه يمكنك تحميل [`TFAutoModel`] مثل تحميل [`AutoTokenizer`]. والفرق الوحيد هو تحديد [`TFAutoModel`] الصحيح للمهمة. للتصنيف النصي (أو التسلسلي)، يجب تحميل [`TFAutoModelForSequenceClassification`]:
+
+```py
+>>> from transformers import TFAutoModelForSequenceClassification
+
+>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
+>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
+```
+
+<Tip>
+
+راجع [ملخص المهام](./task_summary) للمهام المدعومة بواسطة فئة [`AutoModel`].
+
+</Tip>
+
+الآن، مرر دفعة المدخلات المعالجة مسبقًا مباشرة إلى النموذج. يمكنك تمرير المصفوفات كما هي:
+
+```py
+>>> tf_outputs = tf_model(tf_batch)
+```
+
+يقوم النموذج بإخراج التنشيطات النهائية في سمة `logits`. طبق دالة softmax على `logits` لاسترداد الاحتمالات:
+
+```py
+>>> import tensorflow as tf
+
+>>> tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1)
+>>> tf_predictions  # doctest: +IGNORE_RESULT
+```
+</tf>
+</frameworkcontent>
+
+<Tip>
+
+تخرج جميع نماذج 🤗 Transformers (PyTorch أو TensorFlow) المصفوفات *قبل* دالة التنشيط النهائية (مثل softmax) لأن دالة التنشيط النهائية غالبًا ما تكون مدمجة مع دالة الخسارة. نواتج النموذج عبارة عن فئات بيانات خاصة، لذلك يتم استكمال سماتها تلقائيًا في IDE. وتتصرف مخرجات النموذج مثل زوج مرتب أو قاموس (يمكنك الفهرسة باستخدام عدد صحيح ، شريحة، أو سلسلة)، وفي هذه الحالة، يتم تجاهل السمات التي تساوي None.
+
+</Tip>
+
+### حفظ النموذج
+
+<frameworkcontent>
+<pt>
+بمجرد ضبط نموذجك، يمكنك حفظه مع برنامج الترميز الخاص به باستخدام [`PreTrainedModel.save_pretrained`]:
+
+```py
+>>> pt_save_directory = "./pt_save_pretrained"
+>>> tokenizer.save_pretrained(pt_save_directory)  # doctest: +IGNORE_RESULT
+>>> pt_model.save_pretrained(pt_save_directory)
+```
+
+عندما تكون مستعدًا لاستخدام النموذج مرة أخرى، أعد تحميله باستخدام [`PreTrainedModel.from_pretrained`]:
+
+```py
+>>> pt_model = AutoModelForSequenceClassification.from_pretrained("./pt_save_pretrained")
+```
+</pt>
+<tf>
+بمجرد ضبط نموذجك، يمكنك حفظه مع برنامج الترميز الخاص به باستخدام [`TFPreTrainedModel.save_pretrained`]:
+
+```py
+>>> tf_save_directory = "./tf_save_pretrained"
+>>> tokenizer.save_pretrained(tf_save_directory)  # doctest: +IGNORE_RESULT
+>>> tf_model.save_pretrained(tf_save_directory)
+```
+
+عندما تكون مستعدًا لاستخدام النموذج مرة أخرى، أعد تحميله باستخدام [`TFPreTrainedModel.from_pretrained`]:
+
+```py
+>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("./tf_save_pretrained")
+```
+</tf>
+</frameworkcontent>
+
+من الميزات الرائعة في 🤗 Transformers القدرة على حفظ نموذج وإعادة تحميله كنموذج PyTorch أو TensorFlow. يمكن أن يحول معامل `from_pt` أو `from_tf` النموذج من إطار عمل إلى آخر:
+
+<frameworkcontent>
+<pt>
+
+```py
+>>> from transformers import AutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
+>>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True)
+```
+</pt>
+<tf>
+
+```py
+>>> from transformers import TFAutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)
+>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True)
+```
+</tf>
+</frameworkcontent>
+
+
+## إنشاء نماذج مخصصة
+
+يمكنك تعديل فئة تكوين النموذج لتغيير كيفية بناء النموذج. يحدد التكوين سمات النموذج، مثل عدد الطبقات المخفية أو رؤوس الاهتمام. تبدأ من الصفر عند تهيئة نموذج من فئة تكوين مخصصة. يتم تهيئة سمات النموذج بشكل عشوائي، ويجب تدريب النموذج قبل استخدامه للحصول على نتائج ذات معنى.
+
+ابدأ باستيراد [`AutoConfig`]. ثم قم بتحميل النموذج المُدرب مسبقًا الذي تريد تعديله. ضمن [`AutoConfig.from_pretrained`]. يمكنك تحديد السمة التي تريد تغييرها، مثل عدد رؤوس الاهتمام:
+
+```py
+>>> from transformers import AutoConfig
+
+>>> my_config = AutoConfig.from_pretrained("distilbert/distilbert-base-uncased", n_heads=12)
+```
+
+<frameworkcontent>
+<pt>
+قم بإنشاء نموذج من تكوينك المخصص باستخدام [`AutoModel.from_config`]:
+
+```py
+>>> from transformers import AutoModel
+
+>>> my_model = AutoModel.from_config(my_config)
+```
+</pt>
+<tf>
+قم بإنشاء نموذج من تكوينك المخصص باستخدام [`TFAutoModel.from_config`]:
+
+```py
+>>> from transformers import TFAutoModel
+
+>>> my_model = TFAutoModel.from_config(my_config)
+```
+</tf>
+</frameworkcontent>
+
+الق نظرة على دليل [إنشاء بنية مخصصة](./create_a_model) لمزيد من المعلومات حول بناء التكوينات المخصصة.
+
+## المدرب - حلقة تدريب محسنة لـ PyTorch
+
+جميع النماذج عبارة عن [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) قياسية، لذا يمكنك استخدامها في أي حلقة تدريب نموذجية. في حين يمكنك كتابة حلقة التدريب الخاصة بك، يوفر 🤗 Transformers فئة [`Trainer`] لـ PyTorch، والتي تحتوي على حلقة التدريب الأساسية وتضيف وظائف إضافية لميزات مثل التدريب الموزع، والدقة المختلطة، والمزيد.
+
+وفقًا لمهمتك، ستقوم عادةً بتمرير المعلمات التالية إلى [`Trainer`]:
+
+1. ستبدأ بـ [`PreTrainedModel`] أو [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module):
+
+   ```py
+   >>> from transformers import AutoModelForSequenceClassification
+
+   >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
+   ```
+
+2. تحتوي [`TrainingArguments`] على فرط معلمات النموذج التي يمكنك تغييرها مثل معدل التعلم، وحجم الدفعة، وعدد العصور التي يجب التدريب عليها. يتم استخدام القيم الافتراضية إذا لم تحدد أي حجج تدريب:
+
+   ```py
+   >>> from transformers import TrainingArguments
+
+   >>> training_args = TrainingArguments(
+   ...     output_dir="path/to/save/folder/",
+   ...     learning_rate=2e-5,
+   ...     per_device_train_batch_size=8,
+   ...     per_device_eval_batch_size=8,
+   ...     num_train_epochs=2,
+   ... )
+   ```
+
+3. قم بتحميل فئة معالجة مسبقة مثل برنامج الترميز، أو معالج الصور، أو مستخرج الميزات، أو المعالج:
+
+   ```py
+   >>> from transformers import AutoTokenizer
+
+   >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
+   ```
+
+4. قم بتحميل مجموعة بيانات:
+
+   ```py
+   >>> from datasets import load_dataset
+
+   >>> dataset = load_dataset("rotten_tomatoes")  # doctest: +IGNORE_RESULT
+   ```
+
+5. قم بإنشاء دالة لترميز مجموعة البيانات:
+
+   ```py
+   >>> def tokenize_dataset(dataset):
+   ...     return tokenizer(dataset["text"])
+   ```
+
+   ثم قم بتطبيقه على مجموعة البيانات بأكملها باستخدام [`~datasets.Dataset.map`]:
+
+   ```py
+   >>> dataset = dataset.map(tokenize_dataset, batched=True)
+   ```
+
+6. [`DataCollatorWithPadding`] لإنشاء دفعة من الأمثلة من مجموعة البيانات الخاصة بك:
+
+   ```py
+   >>> from transformers import DataCollatorWithPadding
+
+   >>> data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+   ```
+
+الآن قم بتجميع جميع هذه الفئات في [`Trainer`]:
+
+```py
+>>> from transformers import Trainer
+
+>>> trainer = Trainer(
+...     model=model,
+...     args=training_args,
+...     train_dataset=dataset["train"],
+...     eval_dataset=dataset["test"],
+...     tokenizer=tokenizer,
+...     data_collator=data_collator,
+... )  # doctest: +SKIP
+```
+عندما تكون مستعدًا، استدعِ [`~Trainer.train`] لبدء التدريب:
+
+```py
+>>> trainer.train()  # doctest: +SKIP
+```
+
+<Tip>
+
+بالنسبة للمهام - مثل الترجمة أو التلخيص - التي تستخدم نموذج تسلسل إلى تسلسل، استخدم فئات [`Seq2SeqTrainer`] و [`Seq2SeqTrainingArguments`] بدلاً من ذلك.
+
+</Tip>
+
+يمكنك تخصيص سلوك حلقة التدريب عن طريق إنشاء فئة فرعية من الطرق داخل [`Trainer`]. يسمح لك ذلك بتخصيص ميزات مثل دالة الخسارة، والمحسن، والمجدول. راجع مرجع [`Trainer`] للتعرف على الطرق التي يمكن إنشاء فئات فرعية منها.
+
+والطريقة الأخرى لتخصيص حلقة التدريب هي باستخدام [المستدعيات](./main_classes/callback). يمكنك استخدام المستدعيات للتكامل مع المكتبات الأخرى ومراقبة حلقة التدريب للإبلاغ عن التقدم أو إيقاف التدريب مبكرًا. لا تعدل المستدعيات أي شيء في حلقة التدريب نفسها. لتخصيص شيء مثل دالة الخسارة، تحتاج إلى إنشاء فئة فرعية من [`Trainer`] بدلاً من ذلك.
+
+## التدريب باستخدام TensorFlow
+
+جميع النماذج عبارة عن [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) قياسية، لذا يمكن تدريبها في TensorFlow باستخدام واجهة برمجة تطبيقات Keras. يوفر 🤗 Transformers طريقة [`~TFPreTrainedModel.prepare_tf_dataset`] لتحميل مجموعة البيانات الخاصة بك بسهولة كـ `tf.data.Dataset` حتى تتمكن من البدء في التدريب على الفور باستخدام دالتي `compile` و`fit` في Keras.
+
+1. ستبدأ بـ [`TFPreTrainedModel`] أو [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model):
+
+   ```py
+   >>> from transformers import TFAutoModelForSequenceClassification
+
+   >>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
+   ```
+
+2. قم بتحميل فئة معالجة مسبقة مثل برنامج الترميز، أو معالج الصور، أو مستخرج الميزات، أو المعالج:
+
+   ```py
+   >>> from transformers import AutoTokenizer
+
+   >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
+   ```
+
+3. قم بإنشاء دالة لترميز مجموعة البيانات:
+
+   ```py
+   >>> def tokenize_dataset(dataset):
+   ...     return tokenizer(dataset["text"])  # doctest: +SKIP
+   ```
+
+4. قم بتطبيق برنامج الترميز على مجموعة البيانات بأكملها باستخدام [`~datasets.Dataset.map`] ثم مرر مجموعة البيانات وبرنامج الترميز إلى [`~TFPreTrainedModel.prepare_tf_dataset`]. يمكنك أيضًا تغيير حجم الدفعة وخلط مجموعة البيانات هنا إذا أردت:
+
+   ```py
+   >>> dataset = dataset.map(tokenize_dataset)  # doctest: +SKIP
+   >>> tf_dataset = model.prepare_tf_dataset(
+   ...     dataset["train"], batch_size=16, shuffle=True, tokenizer=tokenizer
+   ... )  # doctest: +SKIP
+   ```
+
+5. عندما تكون مستعدًا، يمكنك استدعاء `compile` و`fit` لبدء التدريب. لاحظ أن جميع نماذج Transformers لديها دالة خسارة ذات صلة بالمهمة بشكل افتراضي، لذا فأنت لست بحاجة إلى تحديد واحدة ما لم ترغب في ذلك:
+
+   ```py
+   >>> from tensorflow.keras.optimizers import Adam
+
+   >>> model.compile(optimizer='adam')  # لا توجد وسيطة دالة الخسارة!
+   >>> model.fit(tf_dataset)  # doctest: +SKIP
+   ```
+
+## ماذا بعد؟
+
+الآن بعد أن أكملت الجولة السريعة في 🤗 Transformers، راجع أدلتنا لمعرفة كيفية القيام بأشياء أكثر تحديدًا مثل كتابة نموذج مخصص، وضبط نموذج مسبق التدريب لمهمة معينة، وكيفية تدريب نموذج باستخدام نص برمجي. إذا كنت مهتمًا بمعرفة المزيد عن المفاهيم الأساسية لـ 🤗 Transformers، فاحصل على فنجان من القهوة واطلع على أدلة المفاهيم الخاصة بنا!
diff --git a/docs/source/ar/run_scripts.md b/docs/source/ar/run_scripts.md
new file mode 100644
index 000000000000..593d4aec85fc
--- /dev/null
+++ b/docs/source/ar/run_scripts.md
@@ -0,0 +1,351 @@
+# التدريب باستخدام نص برمجى
+
+بالإضافة إلى دفاتر الملاحظات [notebooks](./notebooks) الخاصة بـ 🤗 Transformers، هناك أيضًا نصوص برمجية توضيحية تُظهر كيفية تدريب نموذج لمهمة باستخدام [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch) أو [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow) أو [JAX/Flax](https://github.com/huggingface/transformers/tree/main/examples/flax).
+
+كما ستجد النصوص البرمجية التي استخدمناها في [مشاريع الأبحاث](https://github.com/huggingface/transformers/tree/main/examples/research_projects) و [الأمثلة القديمة](https://github.com/huggingface/transformers/tree/main/examples/legacy) والتي ساهم بها المجتمع بشكل أساسي. هذه النصوص البرمجية غير مدعومة بشكل نشط وقد تتطلب إصدارًا محددًا من مكتبة 🤗 Transformers والذي من المحتمل أن يكون غير متوافق مع الإصدار الأحدث من المكتبة.
+
+لا يُتوقع أن تعمل النصوص البرمجية التوضيحية بشكل مباشر على كل مشكلة، وقد تحتاج إلى تكييف النص البرمجي مع المشكلة التي تحاول حلها. ولمساعدتك في ذلك، تعرض معظم النصوص البرمجية كيفية معالجة البيانات قبل التدريب بشكل كامل، مما يتيح لك تحريرها حسب الحاجة لحالتك الاستخدام.
+
+بالنسبة لأي ميزة ترغب في تنفيذها في نص برمجي توضيحي، يرجى مناقشتها في [المنتدى](https://discuss.huggingface.co/) أو في [قضية](https://github.com/huggingface/transformers/issues) قبل إرسال طلب سحب. وفي حين أننا نرحب بإصلاح الأخطاء، فمن غير المرجح أن نقوم بدمج طلب سحب الذي يضيف المزيد من الوظائف على حساب قابلية القراءة.
+
+سيوضح هذا الدليل كيفية تشغيل نص برمجي توضيحي للتدريب على التلخيص في [PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization) و [TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/summarization). يُتوقع أن تعمل جميع الأمثلة مع كلا الإطارين ما لم يُنص على خلاف ذلك.
+
+## الإعداد
+
+لتشغيل الإصدار الأحدث من النصوص البرمجية التوضيحية بنجاح، يجب عليك **تثبيت 🤗 Transformers من المصدر** في بيئة افتراضية جديدة:
+
+```bash
+git clone https://github.com/huggingface/transformers
+cd transformers
+pip install .
+```
+
+بالنسبة للإصدارات الأقدم من النصوص البرمجية التوضيحية، انقر فوق الزر أدناه:
+```bash
+git clone https://github.com/huggingface/transformers
+cd transformers
+pip install .
+```
+
+بالنسبة للإصدارات الأقدم من النصوص البرمجية التوضيحية، انقر فوق الزر أدناه:
+
+<details>
+  <summary>أمثلة للإصدارات الأقدم من 🤗 Transformers</summary>
+	<ul>
+		<li><a href="https://github.com/huggingface/transformers/tree/v4.5.1/examples">v4.5.1</a></li>
+		<li><a href="https://github.com/huggingface/transformers/tree/v4.4.2/examples">v4.4.2</a></li>
+		<li><a href="https://github.com/huggingface/transformers/tree/v4.3.3/examples">v4.3.3</a></li>
+		<li><a href="https://github.com/huggingface/transformers/tree/v4.2.2/examples">v4.2.2</a></li>
+		<li><a href="https://github.com/huggingface/transformers/tree/v4.1.1/examples">v4.1.1</a></li>
+		<li><a href="https://github.com/huggingface/transformers/tree/v4.0.1/examples">v4.0.1</a></li>
+		<li><a href="https://github.com/huggingface/transformers/tree/v3.5.1/examples">v3.5.1</a></li>
+		<li><a href="https://github.com/huggingface/transformers/tree/v3.4.0/examples">v3.4.0</a></li>
+		<li><a href="https://github.com/huggingface/transformers/tree/v3.3.1/examples">v3.3.1</a></li>
+		<li><a href="https://github.com/huggingface/transformers/tree/v3.2.0/examples">v3.2.0</a></li>
+		<li><a href="https://github.com/huggingface/transformers/tree/v3.1.0/examples">v3.1.0</a></li>
+		<li><a href="https://github.com/huggingface/transformers/tree/v3.0.2/examples">v3.0.2</a></li>
+		<li><a href="https://github.com/huggingface/transformers/tree/v2.11.0/examples">v2.11.0</a></li>
+		<li><a href="https://github.com/huggingface/transformers/tree/v2.10.0/examples">v2.10.0</a></li>
+		<li><a href="https://github.com/huggingface/transformers/tree/v2.9.1/examples">v2.9.1</a></li>
+		<li><a href="https://github.com/huggingface/transformers/tree/v2.8.0/examples">v2.8.0</a></li>
+		<li><a href="https://github.com/huggingface/transformers/tree/v2.7.0/examples">v2.7.0</a></li>
+		<li><a href="https://github.com/huggingface/transformers/tree/v2.6.0/examples">v2.6.0</a></li>
+		<li><a href="https://github.com/huggingface/transformers/tree/v2.5.1/examples">v2.5.1</a></li>
+		<li><a href="https://github.com/huggingface/transformers/tree/v2.4.0/examples">v2.4.0</a></li>
+		<li><a href="https://github.com/huggingface/transformers/tree/v2.3.0/examples">v2.3.0</a></li>
+		<li><a href="https://github.com/huggingface/transformers/tree/v2.2.0/examples">v2.2.0</a></li>
+		<li><a href="https://github.com/huggingface/transformers/tree/v2.1.0/examples">v2.1.1</a></li>
+		<li><a href="https://github.com/huggingface/transformers/tree/v2.0.0/examples">v2.0.0</a></li>
+		<li><a href="https://github.com/huggingface/transformers/tree/v1.2.0/examples">v1.2.0</a></li>
+		<li><a href="https://github.com/huggingface/transformers/tree/v1.1.0/examples">v1.1.0</a></li>
+		<li><a href="https://github.com/huggingface/transformers/tree/v1.0.0/examples">v1.0.0</a></li>
+	</ul>
+</details>
+
+ثم قم بالتبديل إلى النسخة الحالية من 🤗 Transformers إلى إصدار محدد، مثل v3.5.1 على سبيل المثال:
+
+```bash
+git checkout tags/v3.5.1
+```
+
+بعد إعداد إصدار المكتبة الصحيح، انتقل إلى مجلد الأمثلة الذي تختاره وقم بتثبيت المتطلبات المحددة:
+
+```bash
+pip install -r requirements.txt
+```
+
+## تشغيل نص برمجي
+
+<frameworkcontent>
+<pt>
+    
+- يقوم النص البرمجي التوضيحي بتنزيل مجموعة بيانات ومعالجتها مسبقًا من مكتبة 🤗 [Datasets](https://huggingface.co/docs/datasets).
+- ثم يقوم النص البرمجي بضبط نموذج بيانات دقيق باستخدام [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) على بنية تدعم الملخص. 
+- يوضح المثال التالي كيفية ضبط نموذج [T5-small](https://huggingface.co/google-t5/t5-small) على مجموعة بيانات [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail).
+- يتطلب نموذج T5 معامل `source_prefix` إضافية بسبب الطريقة التي تم تدريبه بها. يتيح هذا المطالبة لـ T5 معرفة أن هذه مهمة التلخيص.
+    
+```bash
+python examples/pytorch/summarization/run_summarization.py \
+    --model_name_or_path google-t5/t5-small \
+    --do_train \
+    --do_eval \
+    --dataset_name cnn_dailymail \
+    --dataset_config "3.0.0" \
+    --source_prefix "summarize: " \
+    --output_dir /tmp/tst-summarization \
+    --per_device_train_batch_size=4 \
+    --per_device_eval_batch_size=4 \
+    --overwrite_output_dir \
+    --predict_with_generate
+```
+</pt>
+<tf>
+    
+- يقوم النص البرمجي التوضيحي بتنزيل مجموعة بيانات ومعالجتها مسبقًا من مكتبة 🤗 [Datasets](https://huggingface.co/docs/datasets/).
+- ثم يقوم النص البرمجي بضبط نموذج بيانات دقيق باستخدام Keras على بنية تدعم الملخص.
+- يوضح المثال التالي كيفية ضبط نموذج [T5-small](https://huggingface.co/google-t5/t5-small) على مجموعة بيانات [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail).
+- يتطلب نموذج T5 ماعمل `source_prefix` إضافية بسبب الطريقة التي تم تدريبه بها. يتيح هذا المطالبة لـ T5 معرفة أن هذه مهمة التلخيص.
+
+```bash
+python examples/tensorflow/summarization/run_summarization.py  \
+    --model_name_or_path google-t5/t5-small \
+    --dataset_name cnn_dailymail \
+    --dataset_config "3.0.0" \
+    --output_dir /tmp/tst-summarization  \
+    --per_device_train_batch_size 8 \
+    --per_device_eval_batch_size 16 \
+    --num_train_epochs 3 \
+    --do_train \
+    --do_eval
+```
+</tf>
+</frameworkcontent>
+
+## التدريب الموزع والدقة المختلطة
+
+يدعم [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) التدريب الموزع والدقة المختلطة، مما يعني أنه يمكنك أيضًا استخدامه في نص برمجي. لتمكين كلتا الميزتين:
+
+- أضف معامل `fp16` لتمكين الدقة المختلطة.
+- قم بتعيين عدد وحدات معالجة الرسومات (GPUs) التي تريد استخدامها باستخدام حجة `nproc_per_node`.
+
+```bash
+torchrun \
+    --nproc_per_node 8 pytorch/summarization/run_summarization.py \
+    --fp16 \
+    --model_name_or_path google-t5/t5-small \
+    --do_train \
+    --do_eval \
+    --dataset_name cnn_dailymail \
+    --dataset_config "3.0.0" \
+    --source_prefix "summarize: " \
+    --output_dir /tmp/tst-summarization \
+    --per_device_train_batch_size=4 \
+    --per_device_eval_batch_size=4 \
+    --overwrite_output_dir \
+    --predict_with_generate
+```
+
+تستخدم نصوص TensorFlow البرمجية استراتيجية [`MirroredStrategy`](https://www.tensorflow.org/guide/distributed_training#mirroredstrategy) للتدريب الموزع، ولا تحتاج إلى إضافة أي معامﻻت إضافية إلى النص البرمجي التدريبي. سيستخدم نص TensorFlow البرمجي وحدات معالجة الرسومات (GPUs) متعددة بشكل افتراضي إذا كانت متوفرة.
+
+## تشغيل نص برمجي على وحدة معالجة الدقة الفائقة (TPU)
+
+<frameworkcontent>
+<pt>
+    
+تُعد وحدات معالجة الدقة الفائقة (TPUs) مصممة خصيصًا لتسريع الأداء. يدعم PyTorch وحدات معالجة الدقة الفائقة (TPUs) مع [XLA](https://www.tensorflow.org/xla) مجمع الدقة الفائقة للتعلم العميق (راجع [هنا](https://github.com/pytorch/xla/blob/master/README.md) لمزيد من التفاصيل). لاستخدام وحدة معالجة الدقة الفائقة (TPU)، قم بتشغيل نص `xla_spawn.py` البرمجي واستخدم معامل `num_cores` لتعيين عدد وحدات معالجة الدقة الفائقة (TPU) التي تريد استخدامها.
+
+```bash
+python xla_spawn.py --num_cores 8 \
+    summarization/run_summarization.py \
+    --model_name_or_path google-t5/t5-small \
+    --do_train \
+    --do_eval \
+    --dataset_name cnn_dailymail \
+    --dataset_config "3.0.0" \
+    --source_prefix "summarize: " \
+    --output_dir /tmp/tst-summarization \
+    --per_device_train_batch_size=4 \
+    --per_device_eval_batch_size=4 \
+    --overwrite_output_dir \
+    --predict_with_generate
+```
+</pt>
+<tf>
+    
+تُعد وحدات معالجة الدقة الفائقة (TPUs) مصممة خصيصًا لتسريع الأداء. تستخدم نصوص TensorFlow البرمجية استراتيجية [`TPUStrategy`](https://www.tensorflow.org/guide/distributed_training#tpustrategy) للتدريب على وحدات معالجة الدقة الفائقة (TPUs). لاستخدام وحدة معالجة الدقة الفائقة (TPU)، قم بتمرير اسم مورد وحدة معالجة الدقة الفائقة (TPU) إلى حجة `tpu`.
+```bash
+python run_summarization.py  \
+    --tpu name_of_tpu_resource \
+    --model_name_or_path google-t5/t5-small \
+    --dataset_name cnn_dailymail \
+    --dataset_config "3.0.0" \
+    --output_dir /tmp/tst-summarization  \
+    --per_device_train_batch_size 8 \
+    --per_device_eval_batch_size 16 \
+    --num_train_epochs 3 \
+    --do_train \
+    --do_eval
+```
+</tf>
+</frameworkcontent>
+
+## تشغيل نص برمجي باستخدام 🤗 Accelerate
+
+🤗 [Accelerate](https://huggingface.co/docs/accelerate) هي مكتبة خاصة بـ PyTorch فقط توفر طريقة موحدة لتدريب نموذج على عدة أنواع من الإعدادات (الاعتماد على وحدة المعالجة المركزية (CPU) فقط، أو وحدات معالجة الرسومات (GPUs) المتعددة، أو وحدات معالجة الدقة الفائقة (TPUs)) مع الحفاظ على الرؤية الكاملة لحلقة تدريب PyTorch. تأكد من تثبيت 🤗 Accelerate إذا لم يكن لديك بالفعل:
+
+> ملاحظة: نظرًا لأن Accelerate في حالة تطوير سريع، يجب تثبيت إصدار Git من Accelerate لتشغيل النصوص البرمجية.
+```bash
+pip install git+https://github.com/huggingface/accelerate
+```
+
+بدلاً من إستخدام النص البرمجي `run_summarization.py`  يجب عليك استخدام النص البرمجي `run_summarization_no_trainer.py` . ستكون النصوص البرمجية المدعومة من 🤗 Accelerate لها ملف `task_no_trainer.py` في المجلد. ابدأ بتشغيل الأمر التالي لإنشاء وحفظ ملف تكوين:
+
+```bash
+accelerate config
+```
+
+اختبر إعدادك للتأكد من أنه تم تكوينه بشكل صحيح:
+
+```bash
+accelerate test
+```
+
+الآن أنت مستعد لبدء التدريب:
+
+```bash
+accelerate launch run_summarization_no_trainer.py \
+    --model_name_or_path google-t5/t5-small \
+    --dataset_name cnn_dailymail \
+    --dataset_config "3.0.0" \
+    --source_prefix "summarize: " \
+    --output_dir ~/tmp/tst-summarization
+```
+
+## استخدام مجموعة بيانات مخصصة
+
+يدعم النص البرمجي للتلخيص مجموعة بيانات مخصصة طالما أنها ملف CSV أو JSON Line. عندما تستخدم مجموعة بياناتك الخاصة، تحتاج إلى تحديد العديد من المعلمات الإضافية:
+
+- `train_file` و`validation_file` يحددان مسار ملفات التدريب والتحقق الخاصة بك.
+- `text_column`  النص المدخل الذي سيتم تلخيصه.
+- `summary_column`  النص الملخص المستهدف الذي سيتم إخراجه.
+
+سيبدو النص البرمجي للتلخيص الذي يستخدم مجموعة بيانات مخصصة على النحو التالي:
+
+```bash
+python examples/pytorch/summarization/run_summarization.py \
+    --model_name_or_path google-t5/t5-small \
+    --do_train \
+    --do_eval \
+    --train_file path_to_csv_or_jsonlines_file \
+    --validation_file path_to_csv_or_jsonlines_file \
+    --text_column text_column_name \
+    --summary_column summary_column_name \
+    --source_prefix "summarize: " \
+    --output_dir /tmp/tst-summarization \
+    --overwrite_output_dir \
+    --per_device_train_batch_size=4 \
+    --per_device_eval_batch_size=4 \
+    --predict_with_generate
+```
+
+## اختبار البرنامج النصي
+
+من الجيد غالبًا تشغيل نصك البرمجي على عدد أقل من أمثلة مجموعة البيانات للتأكد من أن كل شيء يعمل كما هو متوقع قبل الالتزام بمجموعة بيانات كاملة والتي قد تستغرق ساعات لإكمالها. استخدم المعلمات التالية لتقليص مجموعة البيانات إلى عدد أقصى من العينات:
+
+- `max_train_samples`
+- `max_eval_samples`
+- `max_predict_samples`
+
+```bash
+python examples/pytorch/summarization/run_summarization.py \
+    --model_name_or_path google-t5/t5-small \
+    --max_train_samples 50 \
+    --max_eval_samples 50 \
+    --max_predict_samples 50 \
+    --do_train \
+    --do_eval \
+    --dataset_name cnn_dailymail \
+    --dataset_config "3.0.0" \
+    --source_prefix "summarize: " \
+    --output_dir /tmp/tst-summarization \
+    --per_device_train_batch_size=4 \
+    --per_device_eval_batch_size=4 \
+    --overwrite_output_dir \
+    --predict_with_generate
+```
+
+لا تدعم جميع أمثلة النصوص البرمجية المعلمة `max_predict_samples`. إذا لم تكن متأكدًا مما إذا كان نصك البرمجي يدعم هذه المعلمة، فأضف معلمة `-h` للتحقق:
+
+```bash
+examples/pytorch/summarization/run_summarization.py -h
+```
+
+## استئناف التدريب من نقطة تفتيش
+
+خيار آخر مفيد لتمكينه هو استئناف التدريب من نقطة تفتيش سابقة. سيضمن ذلك أنك تستطيع الاستمرار من حيث توقفت دون البدء من جديد إذا تم مقاطعة تدريبك. هناك طريقتان لاستئناف التدريب من نقطة تفتيش.
+
+تستخدم الطريقة الأولى المعلمة `output_dir previous_output_dir` لاستئناف التدريب من أحدث نقطة تفتيش مخزنة في `output_dir`. في هذه الحالة، يجب عليك إزالة `overwrite_output_dir`:
+
+```bash
+python examples/pytorch/summarization/run_summarization.py
+    --model_name_or_path google-t5/t5-small \
+    --do_train \
+    --do_eval \
+    --dataset_name cnn_dailymail \
+    --dataset_config "3.0.0" \
+    --source_prefix "summarize: " \
+    --output_dir /tmp/tst-summarization \
+    --per_device_train_batch_size=4 \
+    --per_device_eval_batch_size=4 \
+    --output_dir previous_output_dir \
+    --predict_with_generate
+```
+
+تستخدم الطريقة الثانية معلمة `resume_from_checkpoint path_to_specific_checkpoint` لاستئناف التدريب من مجلد نقطة تفتيش محددة.
+
+```bash
+python examples/pytorch/summarization/run_summarization.py
+    --model_name_or_path google-t5/t5-small \
+    --do_train \
+    --do_eval \
+    --dataset_name cnn_dailymail \
+    --dataset_config "3.0.0" \
+    --source_prefix "summarize: " \
+    --output_dir /tmp/tst-summarization \
+    --per_device_train_batch_size=4 \
+    --per_device_eval_batch_size=4 \
+    --overwrite_output_dir \
+    --resume_from_checkpoint path_to_specific_checkpoint \
+    --predict_with_generate
+```
+
+## شارك نموذجك
+
+يمكن لجميع النصوص البرمجية رفع نموذجك النهائي إلى [مركز النماذج](https://huggingface.co/models). تأكد من تسجيل الدخول إلى Hugging Face قبل البدء:
+
+```bash
+huggingface-cli login
+```
+
+ثم أضف المعلمة `push_to_hub` إلى النص البرمجي . ستقوم هذه المعلمة بإنشاء مستودع باستخدام اسم مستخدم Hugging Face واسم المجلد المحدد في `output_dir`.
+
+لإعطاء مستودعك اسمًا محددًا، استخدم المعلمة `push_to_hub_model_id` لإضافته. سيتم عرض المستودع تلقائيًا ضمن مساحة الاسم الخاصة بك.
+
+يوضح المثال التالي كيفية رفع نموذج باستخدام اسم مستودع محدد:
+
+```bash
+python examples/pytorch/summarization/run_summarization.py
+    --model_name_or_path google-t5/t5-small \
+    --do_train \
+    --do_eval \
+    --dataset_name cnn_dailymail \
+    --dataset_config "3.0.0" \
+    --source_prefix "summarize: " \
+    --push_to_hub \
+    --push_to_hub_model_id finetuned-t5-cnn_dailymail \
+    --output_dir /tmp/tst-summarization \
+    --per_device_train_batch_size=4 \
+    --per_device_eval_batch_size=4 \
+    --overwrite_output_dir \
+    --predict_with_generate
+```
diff --git a/docs/source/ar/training.md b/docs/source/ar/training.md
new file mode 100644
index 000000000000..d3e354ff8b1a
--- /dev/null
+++ b/docs/source/ar/training.md
@@ -0,0 +1,412 @@
+# ضبط نموذج مُدرب مسبقًا
+
+هناك فوائد كبيرة لاستخدام نموذج مُدرب مسبقًا. فهو يقلل من تكاليف الحوسبة، ويحد من أثرنا البيئي، ويتيح لك استخدام أحدث النماذج دون الحاجة إلى تدريبها من الصفر. توفر مكتبة 🤗 Transformers إمكانية الوصول إلى آلاف النماذج المُدربة مسبقًا لمجموعة واسعة من المهام. عندما تستخدم نموذجًا مُدربًا مسبقًا، فإنك تقوم بتدريبه على مجموعة بيانات خاصة بمهمتك. يُعرف ذلك بالضبط الدقيق، وهي تقنية تدريب قوية للغاية. في هذا البرنامج التعليمي، سوف تقوم بضبط نموذج مُدرب مسبقًا باستخدام إطار عمل للتعلم العميق الذي تختاره:
+
+* ضبط نموذج مُدرب مسبقًا باستخدام 🤗 Transformers [`Trainer`].
+* ضبط نموذج مُدرب مسبقًا في TensorFlow باستخدام Keras.
+* ضبط نموذج مُدرب مسبقًا في PyTorch الأصلي.
+
+<a id='data-processing'></a>
+
+## إعداد مجموعة بيانات
+
+قبل أن تتمكن من ضبط نموذج مُدرب مسبقًا، قم بتنزيل مجموعة بيانات وإعدادها للتدريب. أظهر البرنامج التعليمي السابق كيفية معالجة البيانات للتدريب، والآن لديك الفرصة لاختبار تلك المهارات!
+
+ابدأ بتحميل مجموعة بيانات [Yelp Reviews](https://huggingface.co/datasets/yelp_review_full):
+
+```py
+>>> from datasets import load_dataset
+
+>>> dataset = load_dataset("yelp_review_full")
+>>> dataset["train"][100]
+{'label': 0,
+ 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. I\'ve worked at more than one location. I expect bad days, bad moods, and the occasional mistake. But I have yet to have a decent experience at this store. It will remain a place I avoid unless someone in my party needs to avoid illness from low blood sugar. Perhaps I should go back to the racially biased service of Steak n Shake instead!'}
+```
+
+كما تعلم الآن، تحتاج إلى محول نص إلى رمز (tokenizer) لمعالجة النص وتضمين استراتيجيات للحشو والقص للتعامل مع أي أطوال متسلسلة متغيرة. لمعالجة مجموعة البيانات الخاصة بك في خطوة واحدة، استخدم طريقة 🤗 Datasets [`map`](https://huggingface.co/docs/datasets/process#map) لتطبيق دالة معالجة مسبقة على مجموعة البيانات بأكملها:
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
+
+
+>>> def tokenize_function(examples):
+...     return tokenizer(examples["text"], padding="max_length", truncation=True)
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
+
+
+>>> def tokenize_function(examples):
+...     return tokenizer(examples["text"], padding="max_length", truncation=True)
+
+
+>>> tokenized_datasets = dataset.map(tokenize_function, batched=True)
+```
+
+إذا كنت ترغب، يمكنك إنشاء مجموعة فرعية أصغر من مجموعة البيانات الكاملة لضبطها لتقليل الوقت الذي تستغرقه:
+
+```py
+>>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
+>>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
+```
+
+<a id='trainer'></a>
+
+## التدريب
+
+في هذه المرحلة، يجب عليك اتباع القسم الذي يتوافق مع الإطار الذي تريد استخدامه. يمكنك استخدام الروابط
+في شريط التنقل الأيمن للقفز إلى الإطار الذي تريده - وإذا كنت تريد إخفاء كل المحتوى لإطار معين،
+فاستخدم الزر في الركن العلوي الأيمن من كتلة الإطار!
+
+<frameworkcontent>
+<pt>
+<Youtube id="nvBXf7s7vTI"/>
+
+## التدريب باستخدام PyTorch Trainer
+
+تقدم مكتبة 🤗 Transformers فئة [`Trainer`] مُحسّنة لتدريب نماذج 🤗 Transformers، مما يسهل بدء التدريب دون الحاجة إلى كتابة حلقة التدريب الخاصة بك يدويًا. تدعم واجهة برمجة تطبيقات [`Trainer`] مجموعة واسعة من خيارات التدريب والميزات مثل التسجيل، وتراكم التدرجات، والدقة المختلطة.
+
+ابدأ بتحميل نموذجك وتحديد عدد التصنيفات المتوقعة. من بطاقة مجموعة بيانات Yelp Review [dataset card](https://huggingface.co/datasets/yelp_review_full#data-fields)، تعرف أنه يوجد خمسة تصنيفات:
+
+```py
+>>> from transformers import AutoModelForSequenceClassification
+
+>>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5)
+```
+
+<Tip>
+
+سترى تحذيرًا بشأن بعض أوزان النموذج المُدرب مسبقًا لن تُستخدم وبعض الأوزان الأخرى ستُبدء بشكل عشوائي. لا تقلق، هذا أمر طبيعي تمامًا! يتم التخلص من رأس النموذج المُدرب مسبقًا لشبكة BERT، ويتم استبداله برأس تصنيف يُبدء بشكل عشوائي. سوف تقوم بضبط الرأس الجديد للنموذج بدقة على مهمة تصنيف التسلسلات الخاصة بك، مما ينقل المعرفة من النموذج المُدرب مسبقًا إليه.
+
+</Tip>
+
+### اختيار أحسن العوامل والمتغيرات للتدريب (Training hyperparameters)
+
+بعد ذلك، قم بإنشاء كائن من فئة [`TrainingArguments`] والتي تحتوي على جميع العوامل والمتغيرات التي يمكنك ضبطها بالإضافة إلى خيارات تنشيط التدريب المختلفة. بالنسبة لهذا البرنامج التعليمي، يمكنك البدء بمعاملات التدريب الافتراضية [hyperparameters](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments)، ولكن لا تتردد في تجربتها للعثور على الإعدادات المثلى.
+
+حدد مكان حفظ النسخ من تدريبك:
+
+```py
+>>> from transformers import TrainingArguments
+
+>>> training_args = TrainingArguments(output_dir="test_trainer")
+```
+
+### التقييم
+
+لا يقوم [`Trainer`] تلقائيًا بتقييم أداء النموذج أثناء التدريب. ستحتاج إلى تمرير دالة إلى [`Trainer`] لحساب وإبلاغ المقاييس. توفر مكتبة [🤗 Evaluate](https://huggingface.co/docs/evaluate/index) دالة [`accuracy`](https://huggingface.co/spaces/evaluate-metric/accuracy) بسيطة يمكنك تحميلها باستخدام الدالة [`evaluate.load`] (راجع هذا [الدليل السريع](https://huggingface.co/docs/evaluate/a_quick_tour) لمزيد من المعلومات):
+
+```py
+>>> import numpy as np
+>>> import evaluate
+
+>>> metric = evaluate.load("accuracy")
+```
+
+استدعِ دالة [`~evaluate.compute`] على `metric` لحساب دقة تنبؤاتك. قبل تمرير تنبؤاتك إلى دالة `compute`، تحتاج إلى تحويل  النتائج الخام logits إلى تنبؤات نهائية (تذكر أن جميع نماذج 🤗 Transformers تعيد نتائج الخام logits):
+
+```py
+>>> def compute_metrics(eval_pred):
+...     logits، labels = eval_pred
+...     predictions = np.argmax(logits, axis=-1)
+...     return metric.compute(predictions=predictions, references=labels)
+```
+
+إذا كنت ترغب في مراقبة مقاييس التقييم الخاصة بك أثناء الضبط الدقيق، فحدد معلمة `eval_strategy` في معاملات التدريب الخاصة بك لإظهار مقياس التقييم في نهاية كل حقبة تدريبه:
+
+```py
+>>> from transformers import TrainingArguments, Trainer
+
+>>> training_args = TrainingArguments(output_dir="test_trainer", eval_strategy="epoch")
+```
+
+### المدرب
+
+قم بإنشاء كائن [`Trainer`] باستخدام نموذجك، ومعاملات التدريب، ومجموعات البيانات التدريبية والاختبارية، ودالة التقييم:
+
+```py
+>>> trainer = Trainer(
+...     model=model,
+...     args=training_args,
+...     train_dataset=small_train_dataset,
+...     eval_dataset=small_eval_dataset,
+...     compute_metrics=compute_metrics,
+... )
+```
+
+ثم قم بضبط نموذجك عن طريق استدعاء [`~transformers.Trainer.train`]:
+
+```py
+>>> trainer.train()
+```
+</pt>
+<tf>
+<a id='keras'></a>
+
+<Youtube id="rnTGBy2ax1c"/>
+
+## تدريب نموذج TensorFlow باستخدام Keras
+
+يمكنك أيضًا تدريب نماذج 🤗 Transformers في TensorFlow باستخدام واجهة برمجة تطبيقات Keras!
+
+### تحميل البيانات لـ Keras
+
+عندما تريد تدريب نموذج 🤗 Transformers باستخدام واجهة برمجة تطبيقات Keras، فأنت بحاجة إلى تحويل مجموعة البيانات الخاصة بك إلى تنسيق يفهمه
+Keras. إذا كانت مجموعة البيانات الخاصة بك صغيرة، فيمكنك ببساطة تحويلها بالكامل إلى مصفوفات NumPy وإرسالها إلى Keras.
+دعونا نجرب ذلك أولاً قبل أن نقوم بأي شيء أكثر تعقيدًا.
+
+أولاً، قم بتحميل مجموعة بيانات. سنستخدم مجموعة بيانات CoLA من معيار [GLUE benchmark](https://huggingface.co/datasets/glue)،
+نظرًا لأنه مهمة تصنيف نص ثنائي بسيطة، وسنأخذ فقط قسم التدريب الآن.
+
+```py
+from datasets import load_dataset
+
+dataset = load_dataset("glue"، "cola")
+dataset = dataset ["train"] # خذ فقط قسم التدريب الآن
+```
+
+بعد ذلك، قم بتحميل أداة المُجزّئ اللغوي وقم بترميز البيانات كمصفوفات NumPy. لاحظ أن التصنيفات هي بالفعل قائمة من 0 و 1،
+لذا يمكننا ببساطة تحويل ذلك مباشرة إلى مصفوفة NumPy بدون ترميز!
+
+```py
+from transformers import AutoTokenizer
+import numpy as np
+
+tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
+tokenized_data = tokenizer(dataset["sentence"], return_tensors="np", padding=True)
+# Tokenizer returns a BatchEncoding, but we convert that to a dict for Keras
+tokenized_data = dict(tokenized_data)
+
+labels = np.array(dataset["label"])  # Label is already an array of 0 and 1
+```
+
+أخيرًا، قم بتحميل وتجميع وتناسب النموذج. لاحظ أن نماذج Transformers تحتوي جميعها على دالة خسارة ذات صلة بالمهمة بشكل افتراضي، لذا فأنت لست بحاجة إلى تحديد واحدة ما لم ترغب في ذلك:
+
+```py
+from transformers import TFAutoModelForSequenceClassification
+from tensorflow.keras.optimizers import Adam
+
+# تحميل وتجميع النموذج الخاص بنا
+model = TFAutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased")
+# معدلات التعلم المنخفضة أفضل غالبًا لضبط النماذج الدقيقة
+model.compile(optimizer=Adam(3e-5)) # لا توجد دالة خسارة!
+
+model.fit(tokenized_data, labels)
+```
+
+<Tip>
+
+أنت لست مضطرًا لتمرير دالة خسارة إلى نماذجك عند تجميعها! تختار نماذج Hugging Face تلقائيًا
+دالة خسارة مناسبة لمهمتها وهندسة نموذجها إذا تُركت هذه الحجة فارغة. يمكنك دائمًا
+تجاوز ذلك عن طريق تحديد دالة خسارة بنفسك إذا كنت تريد ذلك!
+
+</Tip>
+
+يعمل هذا النهج بشكل رائع لمجموعات البيانات الصغيرة، ولكن بالنسبة لمجموعات البيانات الأكبر، فقد تجد أنه يصبح مشكلة. لماذا؟
+لأن المصفوفة المرمزة والتصنيفات يجب أن يتم تحميلها بالكامل في الذاكرة، ولأن NumPy لا يتعامل مع
+المصفوفات"غير المنتظمة"، لذا حشو كل عينة  إلى طول أطول عينة في مجموعة البيانات بأكملها. سيؤدي ذلك إلى زيادة حجم المصفوفة لديك، وستبطئ الرموز الزائده من عملية التدريب أيضًا!
+
+### تحميل البيانات كـ tf.data.Dataset
+
+إذا كنت تريد تجنب إبطاء التدريب، فيمكنك تحميل بياناتك كـ `tf.data.Dataset` بدلاً من ذلك. على الرغم من أنه يمكنك كتابة خط أنابيب `tf.data` الخاص بك إذا كنت تريد، إلا أن لدينا طريقتين مختصرتين للقيام بذلك:
+- [`~TFPreTrainedModel.prepare_tf_dataset`]: هذه هي الطريقة التي نوصي بها في معظم الحالات. نظرًا لأنه طريقة
+على نموذجك، فيمكنه فحص النموذج لتحديد الأعمدة القابلة للاستخدام كمدخلات للنموذج تلقائيًا،
+واستبعاد الأعمدة الأخرى لإنشاء مجموعة بيانات أبسط وأكثر كفاءة.
+- [`~datasets.Dataset.to_tf_dataset`]: هذه الطريقة أكثر أساسية، وهي مفيدة عندما تريد التحكم بدقة في كيفية
+إنشاء مجموعة البيانات الخاصة بك، عن طريق تحديد أعمدة `columns` و `label_cols` المحددة التي سيتم تضمينها.
+
+قبل أن تتمكن من استخدام [`~TFPreTrainedModel.prepare_tf_dataset`]، ستحتاج إلى إضافة مخرجات المُجزئ إلى مجموعة البيانات الخاصة بك كأعمدة، كما هو موضح في
+عينة التعليمات البرمجية التالية:
+
+```py
+def tokenize_dataset (data):
+# ستتم إضافة مفاتيح القاموس الذي تمت إعادته كأعمدة إلى مجموعة البيانات
+return tokenizer(data["text"])
+
+
+dataset = dataset.map(tokenize_dataset)
+```
+
+تذكر أن مجموعات بيانات Hugging Face يتم تخزينها على القرص بشكل افتراضي، لذا فلن يؤدي ذلك إلى تضخيم استخدام الذاكرة لديك! بمجرد إضافة الأعمدة، يمكنك بث الدفعات من مجموعة البيانات وإضافة الترميز إلى كل دفعة، مما يقلل بشكل كبير من عدد رموز الترقيم مقارنة بترميز مجموعة البيانات بأكملها.
+
+
+```py
+>>> tf_dataset = model.prepare_tf_dataset(dataset["train"], batch_size=16, shuffle=True, tokenizer=tokenizer)
+```
+
+لاحظ أنه في عينة التعليمات البرمجية أعلاه، تحتاج إلى تمرير المُجزئ اللغوي إلى `prepare_tf_dataset` حتى تتمكن من حشو الدُفعات بشكل صحيح أثناء تحميلها.
+إذا كانت جميع العينات في مجموعة البيانات الخاصة بك بنفس الطول ولم يكن الترميز ضروريًا، فيمكنك تخطي هذا المعامل.
+إذا كنت بحاجة إلى القيام بشيء أكثر تعقيدًا من مجرد ترميز العينات (على سبيل المثال، إفساد الرموز للنمذجة اللغوية المُقنعة)،
+فيمكنك استخدام معامل `collate_fn` بدلاً من ذلك لتمرير دالة يتم استدعاؤها لتحويل
+قائمة العينات إلى دفعة وتطبيق أي معالجة مسبقة تريدها. راجع أمثلةنا [examples](https://github.com/huggingface/transformers/tree/main/examples) أو
+[دفاتر الملاحظات](https://huggingface.co/docs/transformers/notebooks) لرؤية هذا النهج في العمل.
+
+بمجرد إنشاء `tf.data.Dataset`، يمكنك تجميع النموذج وتناسبه كما هو الحال من قبل:
+
+```py
+model.compile(optimizer=Adam(3e-5))  # No loss argument!
+
+model.fit(tf_dataset)
+```
+
+</tf>
+</frameworkcontent>
+
+<a id='pytorch_native'></a>
+## تدريب في PyTorch الأصلي
+
+<frameworkcontent>
+<pt>
+<Youtube id="Dh9CL8fyG80"/>
+
+[`Trainer`] يهتم بحلقة التدريب ويسمح لك بضبط نموذج في سطر واحد من التعليمات البرمجية. بالنسبة للمستخدمين الذين يفضلون كتابة حلقة التدريب الخاصة بهم، يمكنك أيضًا ضبط نموذج 🤗 Transformers في PyTorch الأصلي.
+
+في هذه المرحلة، قد تحتاج إلى إعادة تشغيل دفتر الملاحظات الخاص بك أو تنفيذ التعليمات البرمجية التالية لتحرير بعض الذاكرة:
+
+```py
+del model
+del trainer
+torch.cuda.empty_cache()
+```
+
+بعد ذلك، قم بمعالجة `tokenized_dataset` يدويًا لإعداده للتدريب.
+
+1. إزالة عمود `text` لأن النموذج لا يقبل النص الخام كإدخال:
+
+    ```py
+    >>> tokenized_datasets = tokenized_datasets.remove_columns(["text"])
+    ```
+
+2. إعادة تسمية عمود `label` إلى `labels` لأن النموذج يتوقع أن يكون الاسم `labels`:
+
+    ```py
+    >>> tokenized_datasets = tokenized_datasets.rename_column("label"، "labels")
+    ```
+
+3. قم بتعيين تنسيق مجموعة البيانات لإرجاع مؤشرات PyTorch بدلاً من القوائم:
+
+    ```py
+    >>> tokenized_datasets.set_format("torch")
+    ```
+
+بعد ذلك، قم بإنشاء مجموعة فرعية أصغر من مجموعة البيانات كما هو موضح سابقًا لتسريع الضبط الدقيق:
+
+```py
+>>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
+>>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
+```
+
+### DataLoader
+
+قم بإنشاء `DataLoader` لمجموعات بيانات التدريب والاختبار الخاصة بك حتى تتمكن من التكرار عبر دفعات البيانات:
+
+```py
+>>> from torch.utils.data import DataLoader
+
+>>> train_dataloader = DataLoader(small_train_dataset، shuffle=True، batch_size=8)
+>>> eval_dataloader = DataLoader(small_eval_dataset، batch_size=8)
+```
+
+قم بتحميل نموذجك مع عدد التصنيفات المتوقعة:
+
+```py
+>>> from transformers import AutoModelForSequenceClassification
+
+>>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased"، num_labels=5)
+```
+
+### المحسن ومخطط معدل التعلم
+
+قم بإنشاء محسن ومخطط معدل تعلم لضبط النموذج الدقيق. دعنا نستخدم [`AdamW`](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html) المحسن من PyTorch:
+
+```py
+>>> from torch.optim import AdamW
+
+>>> optimizer = AdamW(model.parameters()، lr=5e-5)
+```
+
+قم بإنشاء مخطط معدل التعلم الافتراضي من [`Trainer`]:
+
+```py
+>>> from transformers import get_scheduler
+
+>>> num_epochs = 3
+>>> num_training_steps = num_epochs * len(train_dataloader)
+>>> lr_scheduler = get_scheduler(
+...     name="linear"، optimizer=optimizer، num_warmup_steps=0، num_training_steps=num_training_steps
+... )
+```
+
+أخيرًا، حدد `device` لاستخدام وحدة معالجة الرسومات (GPU) إذا كان لديك حق الوصول إليها. وإلا، فقد يستغرق التدريب على وحدة المعالجة المركزية (CPU) عدة ساعات بدلاً من دقائق قليلة.
+
+```py
+>>> import torch
+
+>>> device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+>>> model.to(device)
+```
+
+<Tip>
+
+احصل على وصول مجاني إلى وحدة معالجة رسومات سحابية إذا لم يكن لديك واحدة مع دفتر ملاحظات مستضاف مثل [Colaboratory](https://colab.research.google.com/) أو [SageMaker StudioLab](https://studiolab.sagemaker.aws/).
+
+</Tip>
+
+رائع، الآن أنت مستعد للتدريب! 🥳 
+
+### حلقة التدريب
+
+لمراقبة تقدم التدريب الخاص بك، استخدم مكتبة [tqdm](https://tqdm.github.io/) لإضافة شريط تقدم فوق عدد خطوات التدريب:
+
+```py
+>>> from tqdm.auto import tqdm
+
+>>> progress_bar = tqdm(range(num_training_steps))
+
+>>> model.train()
+>>> for epoch in range(num_epochs):
+...     for batch in train_dataloader:
+...         batch = {k: v.to(device) for k، v in batch.items()}
+...         outputs = model(**batch)
+...         loss = outputs.loss
+...         loss.backward()
+
+...         optimizer.step()
+...         lr_scheduler.step()
+...         optimizer.zero_grad()
+...         progress_bar.update(1)
+```
+
+### تقييم
+
+تمامًا كما أضفت وظيفة تقييم إلى [`Trainer`]]، تحتاج إلى القيام بنفس الشيء عندما تكتب حلقة التدريب الخاصة بك. ولكن بدلاً من حساب الإبلاغ عن المقياس في نهاية كل حقبة، هذه المرة ستقوم بتجميع جميع الدفعات باستخدام [`~evaluate.add_batch`] وحساب المقياس في النهاية.
+
+```py
+>>> import evaluate
+
+>>> metric = evaluate.load("accuracy")
+>>> model.eval()
+>>> for batch in eval_dataloader:
+...     batch = {k: v.to(device) for k، v in batch.items()}
+...     with torch.no_grad():
+...         outputs = model(**batch)
+
+...     logits = outputs.logits
+...     predictions = torch.argmax(logits، dim=-1)
+...     metric.add_batch(predictions=predictions، references=batch["labels"])
+
+>>> metric.compute()
+```
+</pt>
+</frameworkcontent>
+
+<a id='additional-resources'></a>
+
+## موارد إضافية
+
+لمزيد من الأمثلة على الضبط الدقيق، راجع:
+
+- [🤗 أمثلة المحولات](https://github.com/huggingface/transformers/tree/main/examples) تتضمن
+  النصوص البرمجية لتدريب مهام NLP الشائعة في PyTorch وTensorFlow.
+
+- [🤗 دفاتر ملاحظات المحولات](notebooks) يحتوي على دفاتر ملاحظات مختلفة حول كيفية ضبط نموذج لمهمة محددة في PyTorch وTensorFlow.
\ No newline at end of file

From 98adf24883b007c2a7fb17bab1c01b1614673433 Mon Sep 17 00:00:00 2001
From: Yoach Lacombe <52246514+ylacombe@users.noreply.github.com>
Date: Mon, 16 Sep 2024 19:05:17 +0200
Subject: [PATCH 15/67] [Whisper test] Fix some failing tests (#33450)

* Fix failing tensor placement in Whisper

* fix long form generation tests

* more return_timestamps=True

* make fixup

* [run_slow] whisper

* [run_slow] whisper
---
 tests/models/whisper/test_modeling_whisper.py | 25 +++++++++++--------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index 09be23a0d381..ebc9ce5ec358 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -1683,9 +1683,9 @@ def test_labels_sequence_max_length_correct(self):
             input_features = input_dict["input_features"]
 
             labels_length = config.max_target_positions
-            labels = torch.ones(1, labels_length, dtype=torch.int64)
+            labels = torch.ones(1, labels_length, dtype=torch.int64).to(torch_device)
 
-            model = model_class(config)
+            model = model_class(config).to(torch_device)
             model(input_features=input_features, labels=labels)
 
     def test_labels_sequence_max_length_correct_after_changing_config(self):
@@ -1697,9 +1697,9 @@ def test_labels_sequence_max_length_correct_after_changing_config(self):
             config.max_target_positions += 100
 
             labels_length = config.max_target_positions
-            labels = torch.ones(1, labels_length, dtype=torch.int64)
+            labels = torch.ones(1, labels_length, dtype=torch.int64).to(torch_device)
 
-            model = model_class(config)
+            model = model_class(config).to(torch_device)
             model(input_features=input_features, labels=labels)
 
     def test_labels_sequence_max_length_error(self):
@@ -1709,9 +1709,9 @@ def test_labels_sequence_max_length_error(self):
             input_features = input_dict["input_features"]
 
             labels_length = config.max_target_positions + 1
-            labels = torch.ones(1, labels_length, dtype=torch.int64)
+            labels = torch.ones(1, labels_length, dtype=torch.int64).to(torch_device)
 
-            model = model_class(config)
+            model = model_class(config).to(torch_device)
             with self.assertRaises(ValueError):
                 model(input_features=input_features, labels=labels)
 
@@ -1719,11 +1719,11 @@ def test_labels_sequence_max_length_error_after_changing_config(self):
         config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_generative_model_classes:
-            model = model_class(config)
+            model = model_class(config).to(torch_device)
             input_features = input_dict["input_features"]
 
             labels_length = config.max_target_positions + 1
-            labels = torch.ones(1, labels_length, dtype=torch.int64)
+            labels = torch.ones(1, labels_length, dtype=torch.int64).to(torch_device)
 
             new_max_length = config.max_target_positions + 100
             model.config.max_length = new_max_length
@@ -2385,7 +2385,9 @@ def test_tiny_token_timestamp_generation_longform(self):
         )
 
         inputs = inputs.to(torch_device)
-        generate_outputs = model.generate(**inputs, return_segments=True, return_token_timestamps=True)
+        generate_outputs = model.generate(
+            **inputs, return_segments=True, return_token_timestamps=True, return_timestamps=True
+        )
 
         token_timestamps_shape = [
             [segment["token_timestamps"].shape for segment in segment_list]
@@ -2550,14 +2552,14 @@ def test_default_multilingual_transcription_long_form(self):
         ).input_features.to(torch_device)
 
         # task defaults to transcribe
-        sequences = model.generate(input_features)
+        sequences = model.generate(input_features, return_timestamps=True)
 
         transcription = processor.batch_decode(sequences)[0]
 
         assert transcription == " मिर्ची में कितने विबिन्द प्रजातियां हैं? मिर्ची में कितने विबिन्द प्रजातियां हैं?"
 
         # set task to translate
-        sequences = model.generate(input_features, task="translate")
+        sequences = model.generate(input_features, task="translate", return_timestamps=True)
         transcription = processor.batch_decode(sequences)[0]
 
         assert (
@@ -3264,6 +3266,7 @@ def test_whisper_empty_longform(self):
             "num_beams": 5,
             "language": "fr",
             "task": "transcribe",
+            "return_timestamps": True,
         }
 
         torch.manual_seed(0)

From 4ba531c43f884cea2575f1f2c1b287173a28fb1e Mon Sep 17 00:00:00 2001
From: hoshi-hiyouga <hiyouga@buaa.edu.cn>
Date: Tue, 17 Sep 2024 08:31:24 +0800
Subject: [PATCH 16/67] Fix: Qwen2-VL training on video datasets (#33307)

* fix video finetuning

* Update modeling_qwen2_vl.py

* Update modeling_qwen2_vl.py

* fix
---
 .../models/qwen2_vl/modeling_qwen2_vl.py       | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
index 5838c460e7c5..4e4e04198c0b 100644
--- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
+++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
@@ -1680,16 +1680,18 @@ def forward(
             inputs_embeds = self.model.embed_tokens(input_ids)
             if pixel_values is not None:
                 pixel_values = pixel_values.type(self.visual.get_dtype())
-                image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw).to(inputs_embeds.device)
-                image_mask = input_ids == self.config.image_token_id
-                if self.training:
-                    inputs_embeds = inputs_embeds.clone()
-                inputs_embeds[image_mask] = image_embeds
+                image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
+                image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1).expand_as(inputs_embeds)
+                image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
+                inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
+
             if pixel_values_videos is not None:
                 pixel_values_videos = pixel_values_videos.type(self.visual.get_dtype())
-                video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw).to(inputs_embeds.device)
-                video_mask = input_ids == self.config.video_token_id
-                inputs_embeds[video_mask] = video_embeds
+                video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
+                video_mask = (input_ids == self.config.video_token_id).unsqueeze(-1).expand_as(inputs_embeds)
+                video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
+                inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
+
             if attention_mask is not None:
                 attention_mask = attention_mask.to(inputs_embeds.device)
 

From ba1f1dc132257f0e92d2266f966dc19758a3f2b3 Mon Sep 17 00:00:00 2001
From: Steven Shimizu <shimizust@gmail.com>
Date: Mon, 16 Sep 2024 17:40:24 -0700
Subject: [PATCH 17/67] Updated Trainer's liger-kernel integration to call
 correct patching API (#33502)

* Updated liger-kernel integration in Trainer to call correct patching API

* Fixed styling
---
 src/transformers/trainer.py            | 13 ++++++-----
 src/transformers/utils/import_utils.py |  2 +-
 tests/trainer/test_trainer.py          | 30 +++++++++++++++-----------
 3 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index f815c50d597f..97a052093652 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -468,19 +468,18 @@ def __init__(
 
         if self.args.use_liger_kernel:
             if is_liger_kernel_available():
-                from liger_kernel.transformers.trainer_integration import _apply_liger_kernel
+                from liger_kernel.transformers import _apply_liger_kernel_to_instance
 
-                model_type = getattr(model, "config", None) and getattr(model.config, "model_type", None)
-                if model_type:
-                    # Monkey patch the model with liger kernels. Use the default kernel configurations.
-                    _apply_liger_kernel(model_type=model_type)
+                if isinstance(model, PreTrainedModel):
+                    # Patch the model with liger kernels. Use the default kernel configurations.
+                    _apply_liger_kernel_to_instance(model=model)
                 else:
                     logger.warning(
-                        "The model does not have a valid `model_type` specified. No liger kernels will be applied."
+                        "The model is not an instance of PreTrainedModel. No liger kernels will be applied."
                     )
             else:
                 raise ImportError(
-                    "You have set `use_liger_kernel` to `True` but liger-kernel >= 0.1.0 is not available. "
+                    "You have set `use_liger_kernel` to `True` but liger-kernel >= 0.3.0 is not available. "
                     "Please install it with `pip install liger-kernel`"
                 )
 
diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py
index af8a56944346..ad8b649aaa4e 100755
--- a/src/transformers/utils/import_utils.py
+++ b/src/transformers/utils/import_utils.py
@@ -1187,7 +1187,7 @@ def is_liger_kernel_available():
     if not _liger_kernel_available:
         return False
 
-    return version.parse(importlib.metadata.version("liger_kernel")) >= version.parse("0.1.0")
+    return version.parse(importlib.metadata.version("liger_kernel")) >= version.parse("0.3.0")
 
 
 # docstyle-ignore
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 1837d9890352..791486ec8374 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -1344,22 +1344,28 @@ def test_get_eval_dataloader_with_persistent_workers(self):
 
     @require_liger_kernel
     def test_use_liger_kernel_patching(self):
-        # Test that the model code actually gets patched with Liger kernel
-        from liger_kernel.transformers.rms_norm import LigerRMSNorm
+        # Ensure any monkey patching is cleaned up for subsequent tests
+        with patch("transformers.models.llama.modeling_llama"):
+            from liger_kernel.transformers import LigerRMSNorm, liger_rotary_pos_emb
 
-        from transformers.models.llama import modeling_llama
+            from transformers.models.llama import modeling_llama
 
-        config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4)
-        tiny_llama = LlamaForCausalLM(config)
+            config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4)
+            tiny_llama = LlamaForCausalLM(config)
 
-        args = TrainingArguments(
-            "./test",
-            use_liger_kernel=True,
-        )
-        Trainer(tiny_llama, args)
+            # Spot check that modeling code and model instance variables are not yet patched
+            self.assertNotEqual(modeling_llama.apply_rotary_pos_emb, liger_rotary_pos_emb)
+            self.assertFalse(isinstance(tiny_llama.model.norm, LigerRMSNorm))
+
+            args = TrainingArguments(
+                "./test",
+                use_liger_kernel=True,
+            )
+            Trainer(tiny_llama, args)
 
-        # Check that one of the Llama model layers has been correctly patched with Liger kernel
-        self.assertEqual(modeling_llama.LlamaRMSNorm, LigerRMSNorm)
+            # Spot check that modeling code and model instance variables are patched
+            self.assertEqual(modeling_llama.apply_rotary_pos_emb, liger_rotary_pos_emb)
+            self.assertTrue(isinstance(tiny_llama.model.norm, LigerRMSNorm))
 
     @require_liger_kernel
     @require_torch_gpu

From 9f196ef2e084c53c01f8e754b7c29153bd9a626a Mon Sep 17 00:00:00 2001
From: hlky <hlky@hlky.ac>
Date: Tue, 17 Sep 2024 03:13:06 +0100
Subject: [PATCH 18/67] Replace `accelerator.use_fp16` in examples (#33513)

* Replace `accelerator.use_fp16` in examples

* pad_to_multiple_of=16 for fp8
---
 .../pytorch/multiple-choice/run_swag_no_trainer.py    | 11 ++++++++---
 .../run_qa_beam_search_no_trainer.py                  |  9 ++++++++-
 .../pytorch/question-answering/run_qa_no_trainer.py   |  9 ++++++++-
 .../summarization/run_summarization_no_trainer.py     |  8 +++++++-
 .../text-classification/run_glue_no_trainer.py        |  9 ++++++++-
 .../token-classification/run_ner_no_trainer.py        | 11 ++++++++---
 .../pytorch/translation/run_translation_no_trainer.py |  9 ++++++++-
 .../research_projects/luke/run_luke_ner_no_trainer.py | 11 ++++++++---
 .../self-training-text-classification/finetuning.py   |  9 ++++++++-
 .../run_{{cookiecutter.example_shortcut}}.py          |  9 ++++++++-
 10 files changed, 79 insertions(+), 16 deletions(-)

diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
index 8f7693ae5b0d..3987b6d20d5e 100755
--- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py
+++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
@@ -473,9 +473,14 @@ def preprocess_function(examples):
         # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
         # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
         # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
-        data_collator = DataCollatorForMultipleChoice(
-            tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)
-        )
+        # For fp8, we pad to multiple of 16.
+        if accelerator.mixed_precision == "fp8":
+            pad_to_multiple_of = 16
+        elif accelerator.mixed_precision != "no":
+            pad_to_multiple_of = 8
+        else:
+            pad_to_multiple_of = None
+        data_collator = DataCollatorForMultipleChoice(tokenizer, pad_to_multiple_of=pad_to_multiple_of)
 
     train_dataloader = DataLoader(
         train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
index ee791c0c8ddf..f8e2f56f8e08 100644
--- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
@@ -670,7 +670,14 @@ def prepare_validation_features(examples):
         # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
         # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
         # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
-        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None))
+        # For fp8, we pad to multiple of 16.
+        if accelerator.mixed_precision == "fp8":
+            pad_to_multiple_of = 16
+        elif accelerator.mixed_precision != "no":
+            pad_to_multiple_of = 8
+        else:
+            pad_to_multiple_of = None
+        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=pad_to_multiple_of)
 
     train_dataloader = DataLoader(
         train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py
index 7ae0d488bc40..f0a22e51637d 100755
--- a/examples/pytorch/question-answering/run_qa_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_no_trainer.py
@@ -685,7 +685,14 @@ def prepare_validation_features(examples):
         # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
         # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
         # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
-        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None))
+        # For fp8, we pad to multiple of 16.
+        if accelerator.mixed_precision == "fp8":
+            pad_to_multiple_of = 16
+        elif accelerator.mixed_precision != "no":
+            pad_to_multiple_of = 8
+        else:
+            pad_to_multiple_of = None
+        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=pad_to_multiple_of)
 
     train_dataloader = DataLoader(
         train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py
index 36cd590ea5c9..21da10700052 100644
--- a/examples/pytorch/summarization/run_summarization_no_trainer.py
+++ b/examples/pytorch/summarization/run_summarization_no_trainer.py
@@ -534,11 +534,17 @@ def preprocess_function(examples):
         logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
 
     label_pad_token_id = -100 if args.ignore_pad_token_for_loss else tokenizer.pad_token_id
+    if accelerator.mixed_precision == "fp8":
+        pad_to_multiple_of = 16
+    elif accelerator.mixed_precision != "no":
+        pad_to_multiple_of = 8
+    else:
+        pad_to_multiple_of = None
     data_collator = DataCollatorForSeq2Seq(
         tokenizer,
         model=model,
         label_pad_token_id=label_pad_token_id,
-        pad_to_multiple_of=8 if accelerator.use_fp16 else None,
+        pad_to_multiple_of=pad_to_multiple_of,
     )
 
     def postprocess_text(preds, labels):
diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py
index ac62edbe5e9f..da9193ab1cfa 100644
--- a/examples/pytorch/text-classification/run_glue_no_trainer.py
+++ b/examples/pytorch/text-classification/run_glue_no_trainer.py
@@ -426,7 +426,14 @@ def preprocess_function(examples):
         # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
         # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
         # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
-        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None))
+        # For fp8, we pad to multiple of 16.
+        if accelerator.mixed_precision == "fp8":
+            pad_to_multiple_of = 16
+        elif accelerator.mixed_precision != "no":
+            pad_to_multiple_of = 8
+        else:
+            pad_to_multiple_of = None
+        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=pad_to_multiple_of)
 
     train_dataloader = DataLoader(
         train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py
index 2afb38bb44b4..77016e2a6cb8 100755
--- a/examples/pytorch/token-classification/run_ner_no_trainer.py
+++ b/examples/pytorch/token-classification/run_ner_no_trainer.py
@@ -541,9 +541,14 @@ def tokenize_and_align_labels(examples):
         # Otherwise, `DataCollatorForTokenClassification` will apply dynamic padding for us (by padding to the maximum length of
         # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
         # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
-        data_collator = DataCollatorForTokenClassification(
-            tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)
-        )
+        # For fp8, we pad to multiple of 16.
+        if accelerator.mixed_precision == "fp8":
+            pad_to_multiple_of = 16
+        elif accelerator.mixed_precision != "no":
+            pad_to_multiple_of = 8
+        else:
+            pad_to_multiple_of = None
+        data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=pad_to_multiple_of)
 
     train_dataloader = DataLoader(
         train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py
index 97da3f9541d9..70ef92284db0 100644
--- a/examples/pytorch/translation/run_translation_no_trainer.py
+++ b/examples/pytorch/translation/run_translation_no_trainer.py
@@ -517,11 +517,18 @@ def preprocess_function(examples):
         # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
         # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
         # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
+        # For fp8, we pad to multiple of 16.
+        if accelerator.mixed_precision == "fp8":
+            pad_to_multiple_of = 16
+        elif accelerator.mixed_precision != "no":
+            pad_to_multiple_of = 8
+        else:
+            pad_to_multiple_of = None
         data_collator = DataCollatorForSeq2Seq(
             tokenizer,
             model=model,
             label_pad_token_id=label_pad_token_id,
-            pad_to_multiple_of=8 if accelerator.use_fp16 else None,
+            pad_to_multiple_of=pad_to_multiple_of,
         )
 
     train_dataloader = DataLoader(
diff --git a/examples/research_projects/luke/run_luke_ner_no_trainer.py b/examples/research_projects/luke/run_luke_ner_no_trainer.py
index cac487b059d7..1552acbd42c2 100644
--- a/examples/research_projects/luke/run_luke_ner_no_trainer.py
+++ b/examples/research_projects/luke/run_luke_ner_no_trainer.py
@@ -542,9 +542,14 @@ def tokenize_and_align_labels(examples):
         # Otherwise, `DataCollatorForTokenClassification` will apply dynamic padding for us (by padding to the maximum length of
         # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
         # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
-        data_collator = DataCollatorForLukeTokenClassification(
-            tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)
-        )
+        # For fp8, we pad to multiple of 16.
+        if accelerator.mixed_precision == "fp8":
+            pad_to_multiple_of = 16
+        elif accelerator.mixed_precision != "no":
+            pad_to_multiple_of = 8
+        else:
+            pad_to_multiple_of = None
+        data_collator = DataCollatorForLukeTokenClassification(tokenizer, pad_to_multiple_of=pad_to_multiple_of)
 
     train_dataloader = DataLoader(
         train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
diff --git a/examples/research_projects/self-training-text-classification/finetuning.py b/examples/research_projects/self-training-text-classification/finetuning.py
index 0afff6a91ead..4bf9eb28df28 100644
--- a/examples/research_projects/self-training-text-classification/finetuning.py
+++ b/examples/research_projects/self-training-text-classification/finetuning.py
@@ -704,7 +704,14 @@ def preprocess_function(examples):
         # precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple of
         # 8s, which will enable the use of Tensor Cores on NVIDIA hardware with
         # compute capability >= 7.5 (Volta).
-        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None))
+        # For fp8, we pad to multiple of 16.
+        if accelerator.mixed_precision == "fp8":
+            pad_to_multiple_of = 16
+        elif accelerator.mixed_precision != "no":
+            pad_to_multiple_of = 8
+        else:
+            pad_to_multiple_of = None
+        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=pad_to_multiple_of)
 
     train_dataloader = DataLoader(
         train_dataset,
diff --git a/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py b/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py
index 5c39262cc938..0b27b4921293 100755
--- a/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py
+++ b/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py
@@ -836,7 +836,14 @@ def tokenize_function(examples):
         # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
         # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
         # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
-        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None))
+        # For fp8, we pad to multiple of 16.
+        if accelerator.mixed_precision == "fp8":
+            pad_to_multiple_of = 16
+        elif accelerator.mixed_precision != "no":
+            pad_to_multiple_of = 8
+        else:
+            pad_to_multiple_of = None
+        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=pad_to_multiple_of)
 
     train_dataloader = DataLoader(
         train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size

From 18e1a9c7195543ec7b7314fcd995bc7aad559e66 Mon Sep 17 00:00:00 2001
From: Yoach Lacombe <52246514+ylacombe@users.noreply.github.com>
Date: Tue, 17 Sep 2024 08:05:21 +0200
Subject: [PATCH 19/67] Fix parametrization-based weight norm (#33275)

* refactor weight_norm + propose uniformed solution to reconcile meta load_state_dict with classic loading

* make style

* fix sew

* fix sew and sew_d tests
---
 src/transformers/modeling_utils.py            | 33 ++++++++++----
 src/transformers/models/dac/modeling_dac.py   | 44 ++++++++++---------
 .../models/encodec/modeling_encodec.py        | 13 +++++-
 .../modeling_fastspeech2_conformer.py         | 18 +++++---
 .../seamless_m4t/modeling_seamless_m4t.py     | 18 +++++---
 .../modeling_seamless_m4t_v2.py               | 18 +++++---
 src/transformers/models/sew/modeling_sew.py   |  8 +++-
 .../models/sew_d/modeling_sew_d.py            |  8 +++-
 .../models/speecht5/modeling_speecht5.py      | 18 +++++---
 .../models/univnet/modeling_univnet.py        | 38 ++++++++++++----
 src/transformers/models/vits/modeling_vits.py | 14 ++++--
 tests/models/sew/test_modeling_sew.py         |  1 +
 tests/models/sew_d/test_modeling_sew_d.py     |  1 +
 13 files changed, 166 insertions(+), 66 deletions(-)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 359509f469a7..d40697666360 100755
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -818,7 +818,6 @@ def _move_model_to_meta(model, loaded_state_dict_keys, start_prefix):
 def _load_state_dict_into_meta_model(
     model,
     state_dict,
-    loaded_state_dict_keys,  # left for now but could be removed, see below
     start_prefix,
     expected_keys,
     device_map=None,
@@ -847,8 +846,6 @@ def _load_state_dict_into_meta_model(
     # - deepspeed zero 3 support
     # - need to copy metadata if any - see _load_state_dict_into_model
     # - handling error_msgs - mimicking the error handling in module._load_from_state_dict()
-    # - Is there a situation where some keys aren't in `loaded_state_dict_keys` and in which case
-    #   they won't get loaded.
 
     error_msgs = []
 
@@ -868,6 +865,18 @@ def _load_state_dict_into_meta_model(
             # We add only the first key as an example
             new_key = key.replace("beta", "bias")
             renamed_beta[key] = new_key if not renamed_beta else renamed_beta
+
+        # To reproduce `_load_state_dict_into_model` behaviour, we need to manually rename parametrized weigth norm, if necessary.
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            if "weight_g" in key:
+                new_key = key.replace("weight_g", "parametrizations.weight.original0")
+            if "weight_v" in key:
+                new_key = key.replace("weight_v", "parametrizations.weight.original1")
+        else:
+            if "parametrizations.weight.original0" in key:
+                new_key = key.replace("parametrizations.weight.original0", "weight_g")
+            if "parametrizations.weight.original1" in key:
+                new_key = key.replace("parametrizations.weight.original1", "weight_v")
         if new_key:
             old_keys.append(key)
             new_keys.append(new_key)
@@ -884,8 +893,7 @@ def _load_state_dict_into_meta_model(
     is_torch_e4m3fn_available = hasattr(torch, "float8_e4m3fn")
 
     for param_name, param in state_dict.items():
-        # First part of the test is always true as load_state_dict_keys always contains state_dict keys.
-        if param_name not in loaded_state_dict_keys or param_name not in expected_keys:
+        if param_name not in expected_keys:
             continue
 
         if param_name.startswith(start_prefix):
@@ -4132,6 +4140,18 @@ def _fix_key(key):
                 return key.replace("beta", "bias")
             if "gamma" in key:
                 return key.replace("gamma", "weight")
+
+            # to avoid logging parametrized weight norm renaming
+            if hasattr(nn.utils.parametrizations, "weight_norm"):
+                if "weight_g" in key:
+                    return key.replace("weight_g", "parametrizations.weight.original0")
+                if "weight_v" in key:
+                    return key.replace("weight_v", "parametrizations.weight.original1")
+            else:
+                if "parametrizations.weight.original0" in key:
+                    return key.replace("parametrizations.weight.original0", "weight_g")
+                if "parametrizations.weight.original1" in key:
+                    return key.replace("parametrizations.weight.original1", "weight_v")
             return key
 
         original_loaded_keys = loaded_keys
@@ -4376,7 +4396,6 @@ def _find_mismatched_keys(
                 error_msgs, offload_index, state_dict_index = _load_state_dict_into_meta_model(
                     model_to_load,
                     state_dict,
-                    loaded_keys,
                     start_prefix,
                     expected_keys,
                     device_map=device_map,
@@ -4453,7 +4472,6 @@ def _find_mismatched_keys(
                         new_error_msgs, offload_index, state_dict_index = _load_state_dict_into_meta_model(
                             model_to_load,
                             state_dict,
-                            loaded_keys,
                             start_prefix,
                             expected_keys,
                             device_map=device_map,
@@ -4609,7 +4627,6 @@ def _load_pretrained_model_low_mem(
         error_msgs = _load_state_dict_into_meta_model(
             model,
             state_dict,
-            loaded_state_dict_keys,
             start_prefix,
             expected_keys=expected_keys,
             hf_quantizer=hf_quantizer,
diff --git a/src/transformers/models/dac/modeling_dac.py b/src/transformers/models/dac/modeling_dac.py
index 5211685b0231..549f98b59dda 100644
--- a/src/transformers/models/dac/modeling_dac.py
+++ b/src/transformers/models/dac/modeling_dac.py
@@ -494,33 +494,37 @@ def _init_weights(self, module):
             nn.init.constant_(module.bias, 0)
 
     def apply_weight_norm(self):
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
         for layer in self.quantizer.quantizers:
-            nn.utils.weight_norm(layer.in_proj)
-            nn.utils.weight_norm(layer.out_proj)
+            weight_norm(layer.in_proj)
+            weight_norm(layer.out_proj)
 
-        nn.utils.weight_norm(self.encoder.conv1)
-        nn.utils.weight_norm(self.encoder.conv2)
+        weight_norm(self.encoder.conv1)
+        weight_norm(self.encoder.conv2)
 
         for layer in self.encoder.block:
-            nn.utils.weight_norm(layer.conv1)
-            nn.utils.weight_norm(layer.res_unit1.conv1)
-            nn.utils.weight_norm(layer.res_unit1.conv2)
-            nn.utils.weight_norm(layer.res_unit2.conv1)
-            nn.utils.weight_norm(layer.res_unit2.conv2)
-            nn.utils.weight_norm(layer.res_unit3.conv1)
-            nn.utils.weight_norm(layer.res_unit3.conv2)
+            weight_norm(layer.conv1)
+            weight_norm(layer.res_unit1.conv1)
+            weight_norm(layer.res_unit1.conv2)
+            weight_norm(layer.res_unit2.conv1)
+            weight_norm(layer.res_unit2.conv2)
+            weight_norm(layer.res_unit3.conv1)
+            weight_norm(layer.res_unit3.conv2)
 
-        nn.utils.weight_norm(self.decoder.conv1)
-        nn.utils.weight_norm(self.decoder.conv2)
+        weight_norm(self.decoder.conv1)
+        weight_norm(self.decoder.conv2)
 
         for layer in self.decoder.block:
-            nn.utils.weight_norm(layer.conv_t1)
-            nn.utils.weight_norm(layer.res_unit1.conv1)
-            nn.utils.weight_norm(layer.res_unit1.conv2)
-            nn.utils.weight_norm(layer.res_unit2.conv1)
-            nn.utils.weight_norm(layer.res_unit2.conv2)
-            nn.utils.weight_norm(layer.res_unit3.conv1)
-            nn.utils.weight_norm(layer.res_unit3.conv2)
+            weight_norm(layer.conv_t1)
+            weight_norm(layer.res_unit1.conv1)
+            weight_norm(layer.res_unit1.conv2)
+            weight_norm(layer.res_unit2.conv1)
+            weight_norm(layer.res_unit2.conv2)
+            weight_norm(layer.res_unit3.conv1)
+            weight_norm(layer.res_unit3.conv2)
 
     def remove_weight_norm(self):
         for layer in self.quantizer.quantizers:
diff --git a/src/transformers/models/encodec/modeling_encodec.py b/src/transformers/models/encodec/modeling_encodec.py
index f325a6adbe6c..28ccb9513d63 100644
--- a/src/transformers/models/encodec/modeling_encodec.py
+++ b/src/transformers/models/encodec/modeling_encodec.py
@@ -103,8 +103,12 @@ def __init__(
             )
 
         self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride, dilation=dilation)
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
         if self.norm_type == "weight_norm":
-            self.conv = nn.utils.weight_norm(self.conv)
+            self.conv = weight_norm(self.conv)
         elif self.norm_type == "time_group_norm":
             self.norm = nn.GroupNorm(1, out_channels)
 
@@ -186,8 +190,13 @@ def __init__(self, config, in_channels: int, out_channels: int, kernel_size: int
             )
 
         self.conv = nn.ConvTranspose1d(in_channels, out_channels, kernel_size, stride)
+
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
         if config.norm_type == "weight_norm":
-            self.conv = nn.utils.weight_norm(self.conv)
+            self.conv = weight_norm(self.conv)
         elif config.norm_type == "time_group_norm":
             self.norm = nn.GroupNorm(1, out_channels)
 
diff --git a/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py b/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py
index e97e276b18f6..1e1900d38afd 100644
--- a/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py
+++ b/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py
@@ -1416,10 +1416,14 @@ def get_padding(self, kernel_size, dilation=1):
         return (kernel_size * dilation - dilation) // 2
 
     def apply_weight_norm(self):
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
         for layer in self.convs1:
-            nn.utils.weight_norm(layer)
+            weight_norm(layer)
         for layer in self.convs2:
-            nn.utils.weight_norm(layer)
+            weight_norm(layer)
 
     def remove_weight_norm(self):
         for layer in self.convs1:
@@ -1493,12 +1497,16 @@ def _init_weights(self, module):
                 module.bias.data.zero_()
 
     def apply_weight_norm(self):
-        nn.utils.weight_norm(self.conv_pre)
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
+        weight_norm(self.conv_pre)
         for layer in self.upsampler:
-            nn.utils.weight_norm(layer)
+            weight_norm(layer)
         for layer in self.resblocks:
             layer.apply_weight_norm()
-        nn.utils.weight_norm(self.conv_post)
+        weight_norm(self.conv_post)
 
     def remove_weight_norm(self):
         nn.utils.remove_weight_norm(self.conv_pre)
diff --git a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py
index a79d1d4cf2b9..ba8230ec509d 100755
--- a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py
+++ b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py
@@ -2361,10 +2361,14 @@ def get_padding(self, kernel_size, dilation=1):
         return (kernel_size * dilation - dilation) // 2
 
     def apply_weight_norm(self):
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
         for layer in self.convs1:
-            nn.utils.weight_norm(layer)
+            weight_norm(layer)
         for layer in self.convs2:
-            nn.utils.weight_norm(layer)
+            weight_norm(layer)
 
     def remove_weight_norm(self):
         for layer in self.convs1:
@@ -2633,12 +2637,16 @@ def _init_weights(self, module):
                 module.weight.data[module.padding_idx].zero_()
 
     def apply_weight_norm(self):
-        nn.utils.weight_norm(self.hifi_gan.conv_pre)
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
+        weight_norm(self.hifi_gan.conv_pre)
         for layer in self.hifi_gan.upsampler:
-            nn.utils.weight_norm(layer)
+            weight_norm(layer)
         for layer in self.hifi_gan.resblocks:
             layer.apply_weight_norm()
-        nn.utils.weight_norm(self.hifi_gan.conv_post)
+        weight_norm(self.hifi_gan.conv_post)
 
     def remove_weight_norm(self):
         nn.utils.remove_weight_norm(self.hifi_gan.conv_pre)
diff --git a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py
index a53f544bb34f..2d1fde8eed69 100644
--- a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py
+++ b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py
@@ -2608,10 +2608,14 @@ def get_padding(self, kernel_size, dilation=1):
         return (kernel_size * dilation - dilation) // 2
 
     def apply_weight_norm(self):
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
         for layer in self.convs1:
-            nn.utils.weight_norm(layer)
+            weight_norm(layer)
         for layer in self.convs2:
-            nn.utils.weight_norm(layer)
+            weight_norm(layer)
 
     def remove_weight_norm(self):
         for layer in self.convs1:
@@ -2889,12 +2893,16 @@ def _init_weights(self, module):
 
     # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TCodeHifiGan.apply_weight_norm
     def apply_weight_norm(self):
-        nn.utils.weight_norm(self.hifi_gan.conv_pre)
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
+        weight_norm(self.hifi_gan.conv_pre)
         for layer in self.hifi_gan.upsampler:
-            nn.utils.weight_norm(layer)
+            weight_norm(layer)
         for layer in self.hifi_gan.resblocks:
             layer.apply_weight_norm()
-        nn.utils.weight_norm(self.hifi_gan.conv_post)
+        weight_norm(self.hifi_gan.conv_post)
 
     # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TCodeHifiGan.remove_weight_norm
     def remove_weight_norm(self):
diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py
index 191d7c2cd8c6..c9a3494b88b4 100644
--- a/src/transformers/models/sew/modeling_sew.py
+++ b/src/transformers/models/sew/modeling_sew.py
@@ -274,11 +274,15 @@ def __init__(self, config):
             stride=config.squeeze_factor,
         )
 
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
         if is_deepspeed_zero3_enabled():
             import deepspeed
 
             with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
-                self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
+                self.conv = weight_norm(self.conv, name="weight", dim=2)
             if hasattr(self.conv, "parametrizations"):
                 weight_g = self.conv.parametrizations.weight.original0
                 weight_v = self.conv.parametrizations.weight.original1
@@ -288,7 +292,7 @@ def __init__(self, config):
             deepspeed.zero.register_external_parameter(self, weight_v)
             deepspeed.zero.register_external_parameter(self, weight_g)
         else:
-            self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
+            self.conv = weight_norm(self.conv, name="weight", dim=2)
 
         self.padding = SEWSamePadLayer(config.num_conv_pos_embeddings)
         self.activation = ACT2FN[config.feat_extract_activation]
diff --git a/src/transformers/models/sew_d/modeling_sew_d.py b/src/transformers/models/sew_d/modeling_sew_d.py
index a617f5e5d64e..7f3db54defc1 100644
--- a/src/transformers/models/sew_d/modeling_sew_d.py
+++ b/src/transformers/models/sew_d/modeling_sew_d.py
@@ -349,11 +349,15 @@ def __init__(self, config):
             stride=config.squeeze_factor,
         )
 
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
         if is_deepspeed_zero3_enabled():
             import deepspeed
 
             with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
-                self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
+                self.conv = weight_norm(self.conv, name="weight", dim=2)
             if hasattr(self.conv, "parametrizations"):
                 weight_g = self.conv.parametrizations.weight.original0
                 weight_v = self.conv.parametrizations.weight.original1
@@ -363,7 +367,7 @@ def __init__(self, config):
             deepspeed.zero.register_external_parameter(self, weight_v)
             deepspeed.zero.register_external_parameter(self, weight_g)
         else:
-            self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
+            self.conv = weight_norm(self.conv, name="weight", dim=2)
 
         self.padding = SEWDSamePadLayer(config.num_conv_pos_embeddings)
         self.activation = ACT2FN[config.feat_extract_activation]
diff --git a/src/transformers/models/speecht5/modeling_speecht5.py b/src/transformers/models/speecht5/modeling_speecht5.py
index a69e9b56ebc5..790e6a74a471 100644
--- a/src/transformers/models/speecht5/modeling_speecht5.py
+++ b/src/transformers/models/speecht5/modeling_speecht5.py
@@ -3234,10 +3234,14 @@ def get_padding(self, kernel_size, dilation=1):
         return (kernel_size * dilation - dilation) // 2
 
     def apply_weight_norm(self):
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
         for layer in self.convs1:
-            nn.utils.weight_norm(layer)
+            weight_norm(layer)
         for layer in self.convs2:
-            nn.utils.weight_norm(layer)
+            weight_norm(layer)
 
     def remove_weight_norm(self):
         for layer in self.convs1:
@@ -3310,12 +3314,16 @@ def _init_weights(self, module):
                 module.bias.data.zero_()
 
     def apply_weight_norm(self):
-        nn.utils.weight_norm(self.conv_pre)
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
+        weight_norm(self.conv_pre)
         for layer in self.upsampler:
-            nn.utils.weight_norm(layer)
+            weight_norm(layer)
         for layer in self.resblocks:
             layer.apply_weight_norm()
-        nn.utils.weight_norm(self.conv_post)
+        weight_norm(self.conv_post)
 
     def remove_weight_norm(self):
         nn.utils.remove_weight_norm(self.conv_pre)
diff --git a/src/transformers/models/univnet/modeling_univnet.py b/src/transformers/models/univnet/modeling_univnet.py
index 5b0c659c302a..a780e54538f2 100644
--- a/src/transformers/models/univnet/modeling_univnet.py
+++ b/src/transformers/models/univnet/modeling_univnet.py
@@ -87,8 +87,12 @@ def forward(self, hidden_states: torch.FloatTensor):
         return hidden_states + residual
 
     def apply_weight_norm(self):
-        nn.utils.weight_norm(self.conv1)
-        nn.utils.weight_norm(self.conv2)
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
+        weight_norm(self.conv1)
+        weight_norm(self.conv2)
 
     def remove_weight_norm(self):
         nn.utils.remove_weight_norm(self.conv1)
@@ -197,11 +201,15 @@ def forward(self, spectrogram: torch.FloatTensor):
         return kernels, biases
 
     def apply_weight_norm(self):
-        nn.utils.weight_norm(self.input_conv)
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
+        weight_norm(self.input_conv)
         for layer in self.resblocks:
             layer.apply_weight_norm()
-        nn.utils.weight_norm(self.kernel_conv)
-        nn.utils.weight_norm(self.bias_conv)
+        weight_norm(self.kernel_conv)
+        weight_norm(self.bias_conv)
 
     def remove_weight_norm(self):
         nn.utils.remove_weight_norm(self.input_conv)
@@ -328,7 +336,11 @@ def location_variable_convolution(
         return output_hidden_states
 
     def apply_weight_norm(self):
-        nn.utils.weight_norm(self.conv)
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
+        weight_norm(self.conv)
 
     def remove_weight_norm(self):
         nn.utils.remove_weight_norm(self.conv)
@@ -398,7 +410,11 @@ def forward(self, hidden_states: torch.FloatTensor, spectrogram: torch.FloatTens
         return hidden_states
 
     def apply_weight_norm(self):
-        nn.utils.weight_norm(self.convt_pre)
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
+        weight_norm(self.convt_pre)
         self.kernel_predictor.apply_weight_norm()
         for layer in self.resblocks:
             layer.apply_weight_norm()
@@ -619,10 +635,14 @@ def _init_weights(self, module):
                 module.bias.data.zero_()
 
     def apply_weight_norm(self):
-        nn.utils.weight_norm(self.conv_pre)
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
+        weight_norm(self.conv_pre)
         for layer in self.resblocks:
             layer.apply_weight_norm()
-        nn.utils.weight_norm(self.conv_post)
+        weight_norm(self.conv_post)
 
     def remove_weight_norm(self):
         nn.utils.remove_weight_norm(self.conv_pre)
diff --git a/src/transformers/models/vits/modeling_vits.py b/src/transformers/models/vits/modeling_vits.py
index d8dffd4376e0..23bc8a72f8ba 100644
--- a/src/transformers/models/vits/modeling_vits.py
+++ b/src/transformers/models/vits/modeling_vits.py
@@ -461,10 +461,14 @@ def get_padding(self, kernel_size, dilation=1):
         return (kernel_size * dilation - dilation) // 2
 
     def apply_weight_norm(self):
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
         for layer in self.convs1:
-            nn.utils.weight_norm(layer)
+            weight_norm(layer)
         for layer in self.convs2:
-            nn.utils.weight_norm(layer)
+            weight_norm(layer)
 
     def remove_weight_norm(self):
         for layer in self.convs1:
@@ -521,8 +525,12 @@ def __init__(self, config: VitsConfig):
             self.cond = nn.Conv1d(config.speaker_embedding_size, config.upsample_initial_channel, 1)
 
     def apply_weight_norm(self):
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
         for layer in self.upsampler:
-            nn.utils.weight_norm(layer)
+            weight_norm(layer)
         for layer in self.resblocks:
             layer.apply_weight_norm()
 
diff --git a/tests/models/sew/test_modeling_sew.py b/tests/models/sew/test_modeling_sew.py
index 6b21c2e9f712..852f87c8f58a 100644
--- a/tests/models/sew/test_modeling_sew.py
+++ b/tests/models/sew/test_modeling_sew.py
@@ -420,6 +420,7 @@ def test_initialization(self):
             model = model_class(config=configs_no_init)
             for name, param in model.named_parameters():
                 uniform_init_parms = [
+                    "conv.parametrizations.weight",
                     "conv.weight",
                     "masked_spec_embed",
                     "quantizer.weight_proj.weight",
diff --git a/tests/models/sew_d/test_modeling_sew_d.py b/tests/models/sew_d/test_modeling_sew_d.py
index b2efdccdf07c..34374eb1e0e6 100644
--- a/tests/models/sew_d/test_modeling_sew_d.py
+++ b/tests/models/sew_d/test_modeling_sew_d.py
@@ -422,6 +422,7 @@ def test_initialization(self):
             model = model_class(config=configs_no_init)
             for name, param in model.named_parameters():
                 uniform_init_parms = [
+                    "conv.parametrizations.weight",
                     "conv.weight",
                     "masked_spec_embed",
                     "quantizer.weight_proj.weight",

From bcf8946f0acb578c534b1d33d534450d1fc88507 Mon Sep 17 00:00:00 2001
From: Insu Jang <insujang@umich.edu>
Date: Tue, 17 Sep 2024 03:33:07 -0400
Subject: [PATCH 20/67] Fix number of patch check for different vision feature
 select strategy (#32494)

* Fix number of patch check for different vision feature select strategy

* add test

---------

Co-authored-by: raushan <raushan@huggingface.co>
---
 .../models/llava_next/modeling_llava_next.py  | 13 ++++++++++--
 .../llava_next/test_modeling_llava_next.py    | 21 +++++++++++++++++++
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py
index ebb4da3102da..c1d1ca8c276d 100644
--- a/src/transformers/models/llava_next/modeling_llava_next.py
+++ b/src/transformers/models/llava_next/modeling_llava_next.py
@@ -645,7 +645,7 @@ def _merge_input_ids_with_image_features(
 
         return final_embedding, final_attention_mask, position_ids, final_labels, final_input_ids
 
-    def pack_image_features(self, image_features, image_sizes, image_newline=None):
+    def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None):
         """
         Reshape, unpad and then pack each image_feature into a single image_features tensor containing all visual vectors.
 
@@ -654,6 +654,8 @@ def pack_image_features(self, image_features, image_sizes, image_newline=None):
                 List of image feature tensor, each contains all the visual feature of all patches.
             image_sizes (`torch.Tensor` of shape `(num_images, 2)`)
                 Actual image size of each images (H, W).
+            vision_feature_select_strategy (`str`)
+                The feature selection strategy used to select the vision feature from the vision backbone.
             image_newline (`torch.Tensor` of shape `(embed_dim)`)
                 New line embedding vector.
         Returns:
@@ -668,8 +670,14 @@ def pack_image_features(self, image_features, image_sizes, image_newline=None):
                 base_image_feature = image_feature[0]
                 image_feature = image_feature[1:]
                 height = width = self.config.vision_config.image_size // self.config.vision_config.patch_size
-                if height * width != base_image_feature.shape[0]:
+
+                if vision_feature_select_strategy == "default":
+                    expected_num_patches = height * width
+                elif vision_feature_select_strategy == "full":
+                    expected_num_patches = height * width + 1
+                if expected_num_patches != base_image_feature.shape[0]:
                     raise ValueError("The number of patches is not consistent with the image size.")
+
                 num_patch_height, num_patch_width = get_anyres_image_grid_shape(
                     image_sizes[image_idx],
                     self.config.image_grid_pinpoints,
@@ -825,6 +833,7 @@ def forward(
             image_features, feature_lens = self.pack_image_features(
                 image_features,
                 image_sizes,
+                vision_feature_select_strategy=vision_feature_select_strategy,
                 image_newline=self.image_newline,
             )
             if legacy_processing:
diff --git a/tests/models/llava_next/test_modeling_llava_next.py b/tests/models/llava_next/test_modeling_llava_next.py
index 3120db216ea4..bd0b5a190646 100644
--- a/tests/models/llava_next/test_modeling_llava_next.py
+++ b/tests/models/llava_next/test_modeling_llava_next.py
@@ -620,3 +620,24 @@ def test_expansion_in_processing(self):
 
         # check that both inputs are handled correctly and generate the same output
         self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist())
+
+    @slow
+    @require_bitsandbytes
+    def test_small_model_integration_test_full_vision_state_selection(self):
+        model = LlavaNextForConditionalGeneration.from_pretrained(
+            "llava-hf/llava-v1.6-mistral-7b-hf",
+            load_in_4bit=True,
+        )
+        # test that changing `strategy` won't error out
+        model.vision_feature_select_strategy = "full"
+
+        inputs = self.processor(self.prompt, self.image, return_tensors="pt")
+
+        # verify generation
+        output = model.generate(**inputs, max_new_tokens=30)
+        EXPECTED_DECODED_TEXT = '[INST]  \nWhat is shown in this image? [/INST] The image appears to be a radar chart, which is a type of multi-dimensional plot that displays values for multiple quantitative variables represented on axes'  # fmt: skip
+
+        self.assertEqual(
+            self.processor.decode(output[0], skip_special_tokens=True),
+            EXPECTED_DECODED_TEXT,
+        )

From 642256de7176cbc83dfec12c43664e8f21ce2912 Mon Sep 17 00:00:00 2001
From: Saurav Maheshkar <sauravvmaheshkar@gmail.com>
Date: Tue, 17 Sep 2024 10:36:09 +0100
Subject: [PATCH 21/67] chore: migrate coverage cfg to pyproject.toml (#32650)

chore: move coverage cfg to pyproject
---
 .coveragerc    | 12 ------------
 pyproject.toml | 15 +++++++++++++++
 2 files changed, 15 insertions(+), 12 deletions(-)
 delete mode 100644 .coveragerc

diff --git a/.coveragerc b/.coveragerc
deleted file mode 100644
index 9a1103b8af3d..000000000000
--- a/.coveragerc
+++ /dev/null
@@ -1,12 +0,0 @@
-[run]
-source=transformers
-omit =
-    # skip convertion scripts from testing for now
-    */convert_*
-    */__main__.py
-[report]
-exclude_lines =
-    pragma: no cover
-    raise
-    except
-    register_parameter
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 9bf76d819548..bf78e0174394 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,3 +1,18 @@
+[tool.coverage.run]
+source = ["transformers"]
+omit = [
+    "*/convert_*",
+    "*/__main__.py"
+]
+
+[tool.coverage.report]
+exclude_lines = [
+    "pragma: no cover",
+    "raise",
+    "except",
+    "register_parameter"
+]
+
 [tool.ruff]
 line-length = 119
 

From 74026b473e8748706a7a86fd20d6a275306d8ffb Mon Sep 17 00:00:00 2001
From: "Wang, Yi" <yi.a.wang@intel.com>
Date: Tue, 17 Sep 2024 17:39:34 +0800
Subject: [PATCH 22/67] =?UTF-8?q?idefics2=20enable=5Finput=5Frequire=5Fgra?=
 =?UTF-8?q?ds=20not=20aligned=20with=20disable=5Finput=5Fre=E2=80=A6=20(#3?=
 =?UTF-8?q?3194)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* idefics2 enable_input_require_grads not aligned with disable_input_require_grads
make peft+idefics2 checkpoints disable fail

Signed-off-by: Wang, Yi <yi.a.wang@intel.com>

* split test case

Signed-off-by: Wang, Yi <yi.a.wang@intel.com>

* fix ci failure

Signed-off-by: Wang, Yi <yi.a.wang@intel.com>

* refine test

Signed-off-by: Wang, Yi <yi.a.wang@intel.com>

---------

Signed-off-by: Wang, Yi <yi.a.wang@intel.com>
---
 .../models/idefics2/modeling_idefics2.py      |  8 ++++
 .../models/speecht5/test_modeling_speecht5.py | 12 ++++++
 tests/test_modeling_common.py                 | 38 +++++++++++++++++++
 3 files changed, 58 insertions(+)

diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py
index f57bdd27fee6..08ada424ea77 100644
--- a/src/transformers/models/idefics2/modeling_idefics2.py
+++ b/src/transformers/models/idefics2/modeling_idefics2.py
@@ -1256,6 +1256,10 @@ def make_inputs_require_grads(module, input, output):
             make_inputs_require_grads
         )
 
+    def disable_input_require_grads(self):
+        self._text_require_grads_hook.remove()
+        self._vision_require_grads_hook.remove()
+
     def get_input_embeddings(self):
         return self.text_model.get_input_embeddings()
 
@@ -1466,6 +1470,10 @@ def make_inputs_require_grads(module, input, output):
             make_inputs_require_grads
         )
 
+    def disable_input_require_grads(self):
+        self._text_require_grads_hook.remove()
+        self._vision_require_grads_hook.remove()
+
     def get_input_embeddings(self):
         return self.model.text_model.get_input_embeddings()
 
diff --git a/tests/models/speecht5/test_modeling_speecht5.py b/tests/models/speecht5/test_modeling_speecht5.py
index 7a8aab83272b..e13cf8dd56c3 100644
--- a/tests/models/speecht5/test_modeling_speecht5.py
+++ b/tests/models/speecht5/test_modeling_speecht5.py
@@ -239,6 +239,12 @@ def test_torchscript_output_hidden_state(self):
     def test_torchscript_simple(self):
         pass
 
+    @unittest.skip(
+        reason="Model returns None for input_embeds, check: https://github.com/huggingface/transformers/issues/33527"
+    )
+    def test_peft_gradient_checkpointing_enable_disable(self):
+        pass
+
 
 @require_torch
 class SpeechT5ForSpeechToTextTester:
@@ -1743,6 +1749,12 @@ def test_training_gradient_checkpointing_use_reentrant(self):
     def test_training_gradient_checkpointing_use_reentrant_false(self):
         pass
 
+    @unittest.skip(
+        reason="Model returns None for input_embeds, check: https://github.com/huggingface/transformers/issues/33527"
+    )
+    def test_peft_gradient_checkpointing_enable_disable(self):
+        pass
+
     # overwrite from test_modeling_common
     def _mock_init_weights(self, module):
         if hasattr(module, "weight") and module.weight is not None:
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index da0570290c57..c7af0b1c9f5b 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -403,6 +403,44 @@ def test_gradient_checkpointing_enable_disable(self):
                         m.gradient_checkpointing, f"Module {n} does not have gradient_checkpointing set to False"
                     )
 
+    def test_peft_gradient_checkpointing_enable_disable(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            if not model_class.supports_gradient_checkpointing:
+                continue
+
+            # at init model should have gradient checkpointing disabled
+            model = model_class(config)
+            self.assertFalse(model.is_gradient_checkpointing)
+
+            # check enable works
+            model._hf_peft_config_loaded = True
+            try:
+                model.gradient_checkpointing_enable()
+            except NotImplementedError:
+                continue
+
+            self.assertTrue(model.is_gradient_checkpointing)
+
+            # Loop over all modules and check that relevant modules have gradient_checkpointing set to True
+            for n, m in model.named_modules():
+                if hasattr(m, "gradient_checkpointing"):
+                    self.assertTrue(
+                        m.gradient_checkpointing, f"Module {n} does not have gradient_checkpointing set to True"
+                    )
+
+            # check disable works
+            model.gradient_checkpointing_disable()
+            self.assertFalse(model.is_gradient_checkpointing)
+
+            # Loop over all modules and check that relevant modules have gradient_checkpointing set to False
+            for n, m in model.named_modules():
+                if hasattr(m, "gradient_checkpointing"):
+                    self.assertFalse(
+                        m.gradient_checkpointing, f"Module {n} does not have gradient_checkpointing set to False"
+                    )
+
     @is_flaky(description="low likelihood of failure, reason not yet discovered")
     def test_save_load_fast_init_from_base(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

From ac5a0556f14dec503b064d5802da1092e0b558ea Mon Sep 17 00:00:00 2001
From: Max Buckley <maxwbuckley@gmail.com>
Date: Tue, 17 Sep 2024 13:32:49 +0200
Subject: [PATCH 23/67] =?UTF-8?q?Update=20chameleon.md=20=E2=80=94=20fix?=
 =?UTF-8?q?=20runtime=20type=20error=20(#33494)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Update chameleon.md

Fix error

RuntimeError: Input type (float) and bias type (c10::BFloat16) should be the same
---
 docs/source/en/model_doc/chameleon.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/en/model_doc/chameleon.md b/docs/source/en/model_doc/chameleon.md
index 323b83813160..28ec01ad6158 100644
--- a/docs/source/en/model_doc/chameleon.md
+++ b/docs/source/en/model_doc/chameleon.md
@@ -78,7 +78,7 @@ url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
 image = Image.open(requests.get(url, stream=True).raw)
 prompt = "What do you see in this image?<image>"
 
-inputs = processor(prompt, image, return_tensors="pt").to(model.device)
+inputs = processor(prompt, image, return_tensors="pt").to(model.device, dtype=torch.bfloat16)
 
 # autoregressively complete prompt
 output = model.generate(**inputs, max_new_tokens=50)

From 763548427d028878f4d4d8fb6f0be57cc3915fbd Mon Sep 17 00:00:00 2001
From: Antoine Dussolle <70636860+A-Duss@users.noreply.github.com>
Date: Wed, 18 Sep 2024 00:08:05 +0900
Subject: [PATCH 24/67] Add explicit example for RAG chat templating (#33503)

* Add explicit example for RAG chat templating

* Add Tip box and reformulate

Co-authored-by: Matt <Rocketknight1@users.noreply.github.com>

---------

Co-authored-by: Matt <Rocketknight1@users.noreply.github.com>
---
 docs/source/en/chat_templating.md | 67 +++++++++++++++++++++++++------
 1 file changed, 55 insertions(+), 12 deletions(-)

diff --git a/docs/source/en/chat_templating.md b/docs/source/en/chat_templating.md
index f65dbc016e16..543d9fa00b8b 100644
--- a/docs/source/en/chat_templating.md
+++ b/docs/source/en/chat_templating.md
@@ -616,22 +616,65 @@ than the JSON schemas used for tools, no helper functions are necessary.
 Here's an example of a RAG template in action:
 
 ```python
-document1 = {
-    "title": "The Moon: Our Age-Old Foe",
-    "contents": "Man has always dreamed of destroying the moon. In this essay, I shall..."
-}
+from transformers import AutoTokenizer, AutoModelForCausalLM
 
-document2 = {
-    "title": "The Sun: Our Age-Old Friend",
-    "contents": "Although often underappreciated, the sun provides several notable benefits..."
-}
+# Load the model and tokenizer
+model_id = "CohereForAI/c4ai-command-r-v01-4bit"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
+device = model.device # Get the device the model is loaded on
 
-model_input = tokenizer.apply_chat_template(
-    messages,
-    documents=[document1, document2]
-)
+# Define conversation input
+conversation = [
+    {"role": "user", "content": "What has Man always dreamed of?"}
+]
+
+# Define documents for retrieval-based generation
+documents = [
+    {
+        "title": "The Moon: Our Age-Old Foe", 
+        "text": "Man has always dreamed of destroying the moon. In this essay, I shall..."
+    },
+    {
+        "title": "The Sun: Our Age-Old Friend",
+        "text": "Although often underappreciated, the sun provides several notable benefits..."
+    }
+]
+
+# Tokenize conversation and documents using a RAG template, returning PyTorch tensors.
+input_ids = tokenizer.apply_chat_template(
+    conversation=conversation,
+    documents=documents,
+    chat_template="rag",
+    tokenize=True,
+    add_generation_prompt=True,
+    return_tensors="pt").to(device)
+
+# Generate a response 
+gen_tokens = model.generate(
+    input_ids,
+    max_new_tokens=100,
+    do_sample=True,
+    temperature=0.3,
+    )
+
+# Decode and print the generated text along with generation prompt
+gen_text = tokenizer.decode(gen_tokens[0])
+print(gen_text)
 ```
 
+<Tip>
+
+The `documents` input for retrieval-augmented generation is not widely supported, and many models have chat templates which simply ignore this input.
+
+To verify if a model supports the `documents` input, you can read its model card, or `print(tokenizer.chat_template)` to see if the `documents` key is used anywhere.
+
+One model class that does support it, though, is Cohere's [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-08-2024) and [Command-R+](https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024), through their `rag` chat template. You can see additional examples of grounded generation using this feature in their model cards.
+
+</Tip>
+
+
+
 ## Advanced: How do chat templates work?
 
 The chat template for a model is stored on the `tokenizer.chat_template` attribute. If no chat template is set, the

From 3476c19e91a882fab8c76970a8798890a3f0f299 Mon Sep 17 00:00:00 2001
From: Guillaume LEGENDRE <glegendre01@gmail.com>
Date: Tue, 17 Sep 2024 18:12:12 +0200
Subject: [PATCH 25/67] CI Build image - move runners (#33530)

* move runners

* move runners

* move runners
---
 .github/workflows/build-docker-images.yml     | 27 ++++++++++++-------
 .../build-nightly-ci-docker-images.yml        |  8 +++---
 .../workflows/build-past-ci-docker-images.yml |  6 +++--
 3 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml
index df772db773e2..c21faf2d7479 100644
--- a/.github/workflows/build-docker-images.yml
+++ b/.github/workflows/build-docker-images.yml
@@ -20,7 +20,8 @@ concurrency:
 jobs:
   latest-docker:
     name: "Latest PyTorch + TensorFlow [dev]"
-    runs-on: [intel-cpu, 8-cpu, ci]
+    runs-on:
+      group: aws-general-8-plus
     steps:
       -
         name: Set up Docker Buildx
@@ -68,7 +69,8 @@ jobs:
 
   latest-torch-deepspeed-docker:
     name: "Latest PyTorch + DeepSpeed"
-    runs-on: [intel-cpu, 8-cpu, ci]
+    runs-on:
+      group: aws-general-8-plus
     steps:
       -
         name: Set up Docker Buildx
@@ -104,7 +106,8 @@ jobs:
   # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`)
   latest-torch-deepspeed-docker-for-push-ci-daily-build:
     name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)"
-    runs-on: [intel-cpu, 8-cpu, ci]
+    runs-on:
+      group: aws-general-8-plus
     steps:
       -
         name: Set up Docker Buildx
@@ -145,7 +148,8 @@ jobs:
     name: "Doc builder"
     # Push CI doesn't need this image
     if: inputs.image_postfix != '-push-ci'
-    runs-on: [intel-cpu, 8-cpu, ci]
+    runs-on:
+      group: aws-general-8-plus
     steps:
       -
         name: Set up Docker Buildx
@@ -180,7 +184,8 @@ jobs:
     name: "Latest PyTorch [dev]"
     # Push CI doesn't need this image
     if: inputs.image_postfix != '-push-ci'
-    runs-on: [intel-cpu, 8-cpu, ci]
+    runs-on:
+      group: aws-general-8-plus
     steps:
       -
         name: Set up Docker Buildx
@@ -215,7 +220,8 @@ jobs:
 
   latest-pytorch-amd:
     name: "Latest PyTorch (AMD) [dev]"
-    runs-on: [intel-cpu, 8-cpu, ci]
+    runs-on:
+      group: aws-general-8-plus
     steps:
       - 
         name: Set up Docker Buildx
@@ -265,7 +271,8 @@ jobs:
     name: "Latest TensorFlow [dev]"
     # Push CI doesn't need this image
     if: inputs.image_postfix != '-push-ci'
-    runs-on: [intel-cpu, 8-cpu, ci]
+    runs-on:
+      group: aws-general-8-plus
     steps:
       -
         name: Set up Docker Buildx
@@ -300,7 +307,8 @@ jobs:
 
   latest-pytorch-deepspeed-amd:
     name: "PyTorch + DeepSpeed (AMD) [dev]"
-    runs-on: [intel-cpu, 8-cpu, ci]
+    runs-on:
+      group: aws-general-8-plus
     steps:
       - 
         name: Set up Docker Buildx
@@ -350,7 +358,8 @@ jobs:
     name: "Latest Pytorch + Quantization [dev]"
      # Push CI doesn't need this image
     if: inputs.image_postfix != '-push-ci'
-    runs-on: [intel-cpu, 8-cpu, ci]
+    runs-on:
+      group: aws-general-8-plus
     steps:
       -
         name: Set up Docker Buildx
diff --git a/.github/workflows/build-nightly-ci-docker-images.yml b/.github/workflows/build-nightly-ci-docker-images.yml
index 0b1b7df5f8a2..4b00a6d3fae3 100644
--- a/.github/workflows/build-nightly-ci-docker-images.yml
+++ b/.github/workflows/build-nightly-ci-docker-images.yml
@@ -13,7 +13,8 @@ concurrency:
 jobs:
   latest-with-torch-nightly-docker:
     name: "Nightly PyTorch + Stable TensorFlow"
-    runs-on: [intel-cpu, 8-cpu, ci]
+    runs-on:
+      group: aws-general-8-plus
     steps:
       -
         name: Set up Docker Buildx
@@ -40,7 +41,8 @@ jobs:
 
   nightly-torch-deepspeed-docker:
     name: "Nightly PyTorch + DeepSpeed"
-    runs-on: [intel-cpu, 8-cpu, ci]
+    runs-on:
+      group: aws-general-8-plus
     steps:
       -
         name: Set up Docker Buildx
@@ -62,4 +64,4 @@ jobs:
           build-args: |
             REF=main
           push: true
-          tags: huggingface/transformers-pytorch-deepspeed-nightly-gpu
\ No newline at end of file
+          tags: huggingface/transformers-pytorch-deepspeed-nightly-gpu
diff --git a/.github/workflows/build-past-ci-docker-images.yml b/.github/workflows/build-past-ci-docker-images.yml
index 6ee60b8a6b60..c4f0b78986ca 100644
--- a/.github/workflows/build-past-ci-docker-images.yml
+++ b/.github/workflows/build-past-ci-docker-images.yml
@@ -16,7 +16,8 @@ jobs:
       fail-fast: false
       matrix:
         version: ["1.13", "1.12", "1.11"]
-    runs-on: [intel-cpu, 8-cpu, ci]
+    runs-on:
+      group: aws-general-8-plus
     steps:
       -
         name: Set up Docker Buildx
@@ -60,7 +61,8 @@ jobs:
       fail-fast: false
       matrix:
         version: ["2.11", "2.10", "2.9", "2.8", "2.7", "2.6", "2.5"]
-    runs-on: [intel-cpu, 8-cpu, ci]
+    runs-on:
+      group: aws-general-8-plus
     steps:
       -
         name: Set up Docker Buildx

From 46c27577b3ec9ccab289368dde36741eafd84c75 Mon Sep 17 00:00:00 2001
From: ErezSC42 <erezshwarts145@gmail.com>
Date: Tue, 17 Sep 2024 21:29:27 +0300
Subject: [PATCH 26/67] fix to jamba config, asserting attention and expert
 offset (#33316)

* fix to jamba config, asserting attention and expert offset

* fix foramtting

* fix foramtting

* fix foramtting

* changed to error raise instead of assertion, added unittests

* fix

* changed t_ to property_

* changed t_ to property_

* quickfix

* ran code styler
---
 .../models/jamba/configuration_jamba.py       |  9 ++++
 tests/models/jamba/test_modeling_jamba.py     | 44 ++++++++++++++++++-
 2 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/jamba/configuration_jamba.py b/src/transformers/models/jamba/configuration_jamba.py
index 58c8a685feab..b493db7ed456 100644
--- a/src/transformers/models/jamba/configuration_jamba.py
+++ b/src/transformers/models/jamba/configuration_jamba.py
@@ -193,6 +193,9 @@ def __init__(
         self.attn_layer_period = attn_layer_period
         self.attn_layer_offset = attn_layer_offset
 
+        self._check_supported_offset("attention", self.attn_layer_period, self.attn_layer_offset)
+        self._check_supported_offset("expert", self.expert_layer_period, self.expert_layer_offset)
+
         self.use_mamba_kernels = use_mamba_kernels
         self.mamba_d_state = mamba_d_state
         self.mamba_d_conv = mamba_d_conv
@@ -222,3 +225,9 @@ def layers_num_experts(self):
             self.num_experts if i % self.expert_layer_period == self.expert_layer_offset else 1
             for i in range(self.num_hidden_layers)
         ]
+
+    def _check_supported_offset(self, property_: str, period: int, offset: int):
+        if offset >= period:
+            raise ValueError(
+                f"{property_} layer offset ({offset}) must be smaller than {property_} layer period ({period})"
+            )
diff --git a/tests/models/jamba/test_modeling_jamba.py b/tests/models/jamba/test_modeling_jamba.py
index 6cbfe62cfe17..6e1a2cf2cf9c 100644
--- a/tests/models/jamba/test_modeling_jamba.py
+++ b/tests/models/jamba/test_modeling_jamba.py
@@ -50,6 +50,48 @@
     )
 
 
+class JambaConfigTester(ConfigTester):
+    def _create_attn_config(self, attn_layer_offset: int, attn_layer_period: int):
+        _input_dict = self.inputs_dict.copy()
+        _input_dict["attn_layer_offset"] = attn_layer_offset
+        _input_dict["attn_layer_period"] = attn_layer_period
+        return self.config_class(**_input_dict)
+
+    def _create_expert_config(self, expert_layer_offset: int, expert_layer_period: int):
+        _input_dict = self.inputs_dict.copy()
+        _input_dict["expert_layer_offset"] = expert_layer_offset
+        _input_dict["expert_layer_period"] = expert_layer_period
+        return self.config_class(**_input_dict)
+
+    def test_attn_offsets(self):
+        self._create_attn_config(attn_layer_offset=0, attn_layer_period=4)
+        self._create_attn_config(attn_layer_offset=1, attn_layer_period=4)
+        self._create_attn_config(attn_layer_offset=2, attn_layer_period=4)
+        self._create_attn_config(attn_layer_offset=3, attn_layer_period=4)
+        with self.parent.assertRaises(ValueError):
+            self._create_attn_config(attn_layer_offset=4, attn_layer_period=4)
+        with self.parent.assertRaises(ValueError):
+            self._create_attn_config(attn_layer_offset=5, attn_layer_period=4)
+
+    def test_expert_offsets(self):
+        self._create_expert_config(expert_layer_offset=0, expert_layer_period=4)
+        self._create_expert_config(expert_layer_offset=1, expert_layer_period=4)
+        self._create_expert_config(expert_layer_offset=2, expert_layer_period=4)
+        self._create_expert_config(expert_layer_offset=3, expert_layer_period=4)
+        with self.parent.assertRaises(ValueError):
+            self._create_expert_config(expert_layer_offset=4, expert_layer_period=4)
+        with self.parent.assertRaises(ValueError):
+            self._create_expert_config(expert_layer_offset=5, expert_layer_period=4)
+
+    def test_jamba_offset_properties(self):
+        self.test_attn_offsets()
+        self.test_expert_offsets()
+
+    def run_common_tests(self):
+        self.test_jamba_offset_properties()
+        return super().run_common_tests()
+
+
 class JambaModelTester:
     def __init__(
         self,
@@ -302,7 +344,7 @@ class JambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
 
     def setUp(self):
         self.model_tester = JambaModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=JambaConfig, hidden_size=37)
+        self.config_tester = JambaConfigTester(self, config_class=JambaConfig, hidden_size=37)
 
     def test_config(self):
         self.config_tester.run_common_tests()

From c29a8694b0bd35b773bf3fa87e0370bf840d2fe5 Mon Sep 17 00:00:00 2001
From: Nikita Krasnytskyi <nikita.kras.kyiv@gmail.com>
Date: Tue, 17 Sep 2024 19:36:11 +0100
Subject: [PATCH 27/67] Fix missing `sequences_scores` in the Whisper beam
 search output  (#32970)

* added sequences_scores to the output

* added beam_indices to output

* added test to check for beam_indices, sequences_scores and their shape

* removed redundant whitespaces

* make fixup
---
 .../models/whisper/generation_whisper.py      | 10 ++++++----
 tests/models/whisper/test_modeling_whisper.py | 19 +++++++++++++++++++
 2 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py
index 91812155c571..8012c3c1bbfc 100644
--- a/src/transformers/models/whisper/generation_whisper.py
+++ b/src/transformers/models/whisper/generation_whisper.py
@@ -1000,13 +1000,15 @@ def _stack_split_outputs(self, seek_outputs, model_output_type, device, kwargs):
         # Stack back seek_outputs tensors after splitting them with the split_by_batch_index method
         outputs = {}
         for key in seek_outputs[0].keys():
-            if key == "sequences":
+            if key in ["sequences", "beam_indices"]:
                 outputs[key] = torch.stack([v[key] for v in seek_outputs], dim=0).to(device)
-            if key in ["scores", "encoder_attentions", "encoder_hidden_states", "logits"]:
+            elif key in ["scores", "encoder_attentions", "encoder_hidden_states", "logits"]:
                 outputs[key] = tuple(
                     torch.stack([v[key][i] for v in seek_outputs]).to(device) for i in range(len(seek_outputs[0][key]))
                 )
-            if key in ["decoder_attentions", "decoder_hidden_states", "cross_attentions"]:
+            elif key == "sequences_scores":
+                outputs[key] = torch.stack([v[key] for v in seek_outputs], dim=0).to(device)
+            elif key in ["decoder_attentions", "decoder_hidden_states", "cross_attentions"]:
                 outputs[key] = tuple(
                     tuple(
                         torch.stack([v[key][i][j] for v in seek_outputs]).squeeze(1).to(device)
@@ -1014,7 +1016,7 @@ def _stack_split_outputs(self, seek_outputs, model_output_type, device, kwargs):
                     )
                     for i in range(len(seek_outputs[0][key]))
                 )
-            if key == "past_key_values":
+            elif key == "past_key_values":
                 past_key_value_type = kwargs.get("past_key_values")
                 if seek_outputs[0][key] is not None:
                     outputs[key] = tuple(
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index ebc9ce5ec358..f7ac2bc12eb2 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -529,6 +529,25 @@ def test_inputs_embeds(self):
             with torch.no_grad():
                 model(**inputs)[0]
 
+    def test_beam_search_output(self):
+        config, input_dict = self.model_tester.prepare_config_and_inputs()
+        model = WhisperForConditionalGeneration(config).to(torch_device).eval()
+
+        input_features = input_dict["input_features"]
+
+        # Perform beam search
+        output = model.generate(
+            input_features, num_beams=3, num_return_sequences=3, return_dict_in_generate=True, output_scores=True
+        )
+
+        # Check if beam_indices and sequences_scores are in the output
+        self.assertIn("beam_indices", output, "beam_indices not found in the output")
+        self.assertIn("sequences_scores", output, "sequences_scores not found in the output")
+
+        # Validate the shapes of the beam_indices and sequences_scores
+        self.assertEqual(output.beam_indices.shape[0], input_features.shape[0] * 3)
+        self.assertEqual(output.sequences_scores.shape[0], input_features.shape[0] * 3)
+
     # training is not supported yet
     @unittest.skip(reason="Training is not supported yet")
     def test_training(self):

From d8500cd229c9f687c667be3e4556ffdd678aed69 Mon Sep 17 00:00:00 2001
From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com>
Date: Tue, 17 Sep 2024 14:44:27 -0400
Subject: [PATCH 28/67] Uniformize kwargs for Pixtral processor (#33521)

* add uniformized pixtral and kwargs

* update doc

* fix _validate_images_text_input_order

* nit
---
 docs/source/en/model_doc/pixtral.md           |   6 +-
 src/transformers/models/pixtral/__init__.py   |   3 +-
 .../models/pixtral/processing_pixtral.py      |  74 ++++---
 src/transformers/processing_utils.py          |  22 +-
 .../models/pixtral/test_processor_pixtral.py  | 198 ++++++++++++++++--
 tests/test_processing_common.py               |   2 +
 tests/utils/test_processing_utils.py          |  12 ++
 7 files changed, 255 insertions(+), 62 deletions(-)

diff --git a/docs/source/en/model_doc/pixtral.md b/docs/source/en/model_doc/pixtral.md
index 8df2bf5af5f9..1c610d19b681 100644
--- a/docs/source/en/model_doc/pixtral.md
+++ b/docs/source/en/model_doc/pixtral.md
@@ -24,7 +24,7 @@ The Pixtral model was released by the Mistral AI team on [Vllm](https://github.c
 Tips:
 
 - Pixtral is a multimodal model, the main contribution is the 2d ROPE on the images, and support for arbitrary image size (the images are not padded together nor are they resized)
-- This model follows the `Llava` familiy, meaning image embeddings are placed instead of the `[IMG]` token placeholders. 
+- This model follows the `Llava` familiy, meaning image embeddings are placed instead of the `[IMG]` token placeholders.
 - The format for one or mulitple prompts is the following:
 ```
 "<s>[INST][IMG]\nWhat are the things I should be cautious about when I visit this place?[/INST]"
@@ -35,7 +35,7 @@ This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts)
 
 Here is an example of how to run it:
 
-```python 
+```python
 from transformers import LlavaForConditionalGeneration, AutoProcessor
 from PIL import Image
 
@@ -51,7 +51,7 @@ IMG_URLS = [
 ]
 PROMPT = "<s>[INST]Describe the images.\n[IMG][IMG][IMG][IMG][/INST]"
 
-inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to("cuda")
+inputs = processor(images=IMG_URLS, text=PROMPT, return_tensors="pt").to("cuda")
 generate_ids = model.generate(**inputs, max_new_tokens=500)
 ouptut = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
 
diff --git a/src/transformers/models/pixtral/__init__.py b/src/transformers/models/pixtral/__init__.py
index e09ed8e60127..8c32b8750b03 100644
--- a/src/transformers/models/pixtral/__init__.py
+++ b/src/transformers/models/pixtral/__init__.py
@@ -43,7 +43,8 @@
 
 
 if TYPE_CHECKING:
-    from .configuration_pixtral import PixtralProcessor, PixtralVisionConfig
+    from .configuration_pixtral import PixtralVisionConfig
+    from .processing_pixtral import PixtralProcessor
 
     try:
         if not is_torch_available():
diff --git a/src/transformers/models/pixtral/processing_pixtral.py b/src/transformers/models/pixtral/processing_pixtral.py
index 9362703c8aa6..1b07aa02771d 100644
--- a/src/transformers/models/pixtral/processing_pixtral.py
+++ b/src/transformers/models/pixtral/processing_pixtral.py
@@ -16,18 +16,36 @@
 Processor class for Pixtral.
 """
 
-from typing import List, Optional, Union
+import sys
+from typing import List, Union
 
 from ...feature_extraction_utils import BatchFeature
 from ...image_utils import ImageInput, is_valid_image, load_image
-from ...processing_utils import ProcessorMixin
-from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
-from ...utils import TensorType, is_torch_device, is_torch_dtype, is_torch_tensor, logging, requires_backends
+from ...processing_utils import ProcessingKwargs, ProcessorMixin, _validate_images_text_input_order
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import is_torch_device, is_torch_dtype, is_torch_tensor, logging, requires_backends
 
 
+if sys.version_info >= (3, 11):
+    from typing import Unpack
+else:
+    from typing_extensions import Unpack
+
 logger = logging.get_logger(__name__)
 
 
+class PixtralProcessorKwargs(ProcessingKwargs, total=False):
+    _defaults = {
+        "text_kwargs": {
+            "padding": False,
+        },
+        "images_kwargs": {},
+        "common_kwargs": {
+            "return_tensors": "pt",
+        },
+    }
+
+
 # Copied from transformers.models.idefics2.processing_idefics2.is_url
 def is_url(val) -> bool:
     return isinstance(val, str) and val.startswith("http")
@@ -143,12 +161,11 @@ def __init__(
 
     def __call__(
         self,
-        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
         images: ImageInput = None,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy] = None,
-        max_length=None,
-        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
+        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+        audio=None,
+        videos=None,
+        **kwargs: Unpack[PixtralProcessorKwargs],
     ) -> BatchMixFeature:
         """
         Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
@@ -158,26 +175,13 @@ def __call__(
         of the above two methods for more information.
 
         Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
             text (`str`, `List[str]`, `List[List[str]]`):
                 The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                 (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                 `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
-            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
-                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
-                tensor. Both channels-first and channels-last formats are supported.
-            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
-                Select a strategy to pad the returned sequences (according to the model's padding side and padding
-                index) among:
-                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
-                  sequence if provided).
-                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
-                  acceptable input length for the model if that argument is not provided.
-                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
-                  lengths).
-            max_length (`int`, *optional*):
-                Maximum length of the returned list and optionally padding length (see above).
-            truncation (`bool`, *optional*):
-                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
             return_tensors (`str` or [`~utils.TensorType`], *optional*):
                 If set, will return tensors of a particular framework. Acceptable values are:
 
@@ -195,6 +199,15 @@ def __call__(
             `None`).
             - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
         """
+        # check if images and text inputs are reversed for BC
+        images, text = _validate_images_text_input_order(images, text)
+
+        output_kwargs = self._merge_kwargs(
+            PixtralProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+
         if images is not None:
             if is_image_or_image_url(images):
                 images = [[images]]
@@ -209,7 +222,7 @@ def __call__(
                     "Invalid input images. Please provide a single image or a list of images or a list of list of images."
                 )
             images = [[load_image(im) for im in sample] for sample in images]
-            image_inputs = self.image_processor(images, patch_size=self.patch_size, return_tensors=return_tensors)
+            image_inputs = self.image_processor(images, patch_size=self.patch_size, **output_kwargs["images_kwargs"])
         else:
             image_inputs = {}
 
@@ -246,16 +259,9 @@ def __call__(
                 while "<placeholder>" in sample:
                     replace_str = replace_strings.pop(0)
                     sample = sample.replace("<placeholder>", replace_str, 1)
-
                 prompt_strings.append(sample)
 
-        text_inputs = self.tokenizer(
-            prompt_strings,
-            return_tensors=return_tensors,
-            padding=padding,
-            truncation=truncation,
-            max_length=max_length,
-        )
+        text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
         return BatchMixFeature(data={**text_inputs, **image_inputs})
 
     # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py
index ee28c01189b4..ddf40c4fe6d4 100644
--- a/src/transformers/processing_utils.py
+++ b/src/transformers/processing_utils.py
@@ -27,7 +27,7 @@
 import numpy as np
 
 from .dynamic_module_utils import custom_object_save
-from .image_utils import ChannelDimension, is_vision_available, valid_images
+from .image_utils import ChannelDimension, is_valid_image, is_vision_available
 
 
 if is_vision_available():
@@ -1003,6 +1003,20 @@ def _validate_images_text_input_order(images, text):
     in the processor's `__call__` method before calling this method.
     """
 
+    def is_url(val) -> bool:
+        return isinstance(val, str) and val.startswith("http")
+
+    def _is_valid_images_input_for_processor(imgs):
+        # If we have an list of images, make sure every image is valid
+        if isinstance(imgs, (list, tuple)):
+            for img in imgs:
+                if not _is_valid_images_input_for_processor(img):
+                    return False
+        # If not a list or tuple, we have been given a single image or batched tensor of images
+        elif not (is_valid_image(imgs) or is_url(imgs)):
+            return False
+        return True
+
     def _is_valid_text_input_for_processor(t):
         if isinstance(t, str):
             # Strings are fine
@@ -1019,11 +1033,11 @@ def _is_valid_text_input_for_processor(t):
     def _is_valid(input, validator):
         return validator(input) or input is None
 
-    images_is_valid = _is_valid(images, valid_images)
-    images_is_text = _is_valid_text_input_for_processor(images) if not images_is_valid else False
+    images_is_valid = _is_valid(images, _is_valid_images_input_for_processor)
+    images_is_text = _is_valid_text_input_for_processor(images)
 
     text_is_valid = _is_valid(text, _is_valid_text_input_for_processor)
-    text_is_images = valid_images(text) if not text_is_valid else False
+    text_is_images = _is_valid_images_input_for_processor(text)
     # Handle cases where both inputs are valid
     if images_is_valid and text_is_valid:
         return images, text
diff --git a/tests/models/pixtral/test_processor_pixtral.py b/tests/models/pixtral/test_processor_pixtral.py
index b70cab1c0744..04aa3ee8a38b 100644
--- a/tests/models/pixtral/test_processor_pixtral.py
+++ b/tests/models/pixtral/test_processor_pixtral.py
@@ -11,14 +11,21 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import shutil
+import tempfile
 import unittest
 
 import requests
 import torch
 
-from transformers.testing_utils import require_vision
+from transformers.testing_utils import (
+    require_torch,
+    require_vision,
+)
 from transformers.utils import is_vision_available
 
+from ...test_processing_common import ProcessorTesterMixin
+
 
 if is_vision_available():
     from PIL import Image
@@ -27,7 +34,7 @@
 
 
 @require_vision
-class PixtralProcessorTest(unittest.TestCase):
+class PixtralProcessorTest(ProcessorTesterMixin, unittest.TestCase):
     processor_class = PixtralProcessor
 
     @classmethod
@@ -40,15 +47,20 @@ def setUpClass(cls):
         cls.image_2 = Image.open(requests.get(cls.url_2, stream=True).raw)
 
     def setUp(self):
-        super().setUp()
+        self.tmpdirname = tempfile.mkdtemp()
 
         # FIXME - just load the processor directly from the checkpoint
         tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/pixtral-12b")
         image_processor = PixtralImageProcessor()
-        self.processor = PixtralProcessor(tokenizer=tokenizer, image_processor=image_processor)
+        processor = PixtralProcessor(tokenizer=tokenizer, image_processor=image_processor)
+        processor.save_pretrained(self.tmpdirname)
+
+    def tearDown(self):
+        shutil.rmtree(self.tmpdirname)
 
     @unittest.skip("No chat template was set for this model (yet)")
     def test_chat_template(self):
+        processor = self.processor_class.from_pretrained(self.tmpdirname)
         expected_prompt = "USER: [IMG]\nWhat is shown in this image? ASSISTANT:"
 
         messages = [
@@ -60,11 +72,12 @@ def test_chat_template(self):
                 ],
             },
         ]
-        formatted_prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True)
+        formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
         self.assertEqual(expected_prompt, formatted_prompt)
 
     @unittest.skip("No chat template was set for this model (yet)")
     def test_image_token_filling(self):
+        processor = self.processor_class.from_pretrained(self.tmpdirname)
         # Important to check with non square image
         image = torch.randint(0, 2, (3, 500, 316))
         expected_image_tokens = 1526
@@ -79,8 +92,8 @@ def test_image_token_filling(self):
                 ],
             },
         ]
-        inputs = self.processor(
-            text=[self.processor.apply_chat_template(messages)],
+        inputs = processor(
+            text=[processor.apply_chat_template(messages)],
             images=[image],
             return_tensors="pt",
         )
@@ -88,14 +101,15 @@ def test_image_token_filling(self):
         self.assertEqual(expected_image_tokens, image_tokens)
 
     def test_processor_with_single_image(self):
+        processor = self.processor_class.from_pretrained(self.tmpdirname)
         prompt_string = "USER: [IMG]\nWhat's the content of the image? ASSISTANT:"
 
         # Make small for checking image token expansion
-        self.processor.image_processor.size = {"longest_edge": 30}
-        self.processor.image_processor.patch_size = {"height": 2, "width": 2}
+        processor.image_processor.size = {"longest_edge": 30}
+        processor.image_processor.patch_size = {"height": 2, "width": 2}
 
         # Test passing in an image
-        inputs_image = self.processor(text=prompt_string, images=self.image_0, return_tensors="pt")
+        inputs_image = processor(text=prompt_string, images=self.image_0, return_tensors="pt")
         self.assertIn("input_ids", inputs_image)
         self.assertTrue(len(inputs_image["input_ids"]) == 1)
         self.assertIsInstance(inputs_image["input_ids"], torch.Tensor)
@@ -115,7 +129,7 @@ def test_processor_with_single_image(self):
         # fmt: on
 
         # Test passing in a url
-        inputs_url = self.processor(text=prompt_string, images=self.url_0, return_tensors="pt")
+        inputs_url = processor(text=prompt_string, images=self.url_0, return_tensors="pt")
         self.assertIn("input_ids", inputs_url)
         self.assertTrue(len(inputs_url["input_ids"]) == 1)
         self.assertIsInstance(inputs_url["input_ids"], torch.Tensor)
@@ -135,14 +149,15 @@ def test_processor_with_single_image(self):
         # fmt: on
 
     def test_processor_with_multiple_images_single_list(self):
+        processor = self.processor_class.from_pretrained(self.tmpdirname)
         prompt_string = "USER: [IMG][IMG]\nWhat's the difference between these two images? ASSISTANT:"
 
         # Make small for checking image token expansion
-        self.processor.image_processor.size = {"longest_edge": 30}
-        self.processor.image_processor.patch_size = {"height": 2, "width": 2}
+        processor.image_processor.size = {"longest_edge": 30}
+        processor.image_processor.patch_size = {"height": 2, "width": 2}
 
         # Test passing in an image
-        inputs_image = self.processor(text=prompt_string, images=[self.image_0, self.image_1], return_tensors="pt")
+        inputs_image = processor(text=prompt_string, images=[self.image_0, self.image_1], return_tensors="pt")
         self.assertIn("input_ids", inputs_image)
         self.assertTrue(len(inputs_image["input_ids"]) == 1)
         self.assertIsInstance(inputs_image["input_ids"], torch.Tensor)
@@ -162,7 +177,7 @@ def test_processor_with_multiple_images_single_list(self):
         # fmt: on
 
         # Test passing in a url
-        inputs_url = self.processor(text=prompt_string, images=[self.url_0, self.url_1], return_tensors="pt")
+        inputs_url = processor(text=prompt_string, images=[self.url_0, self.url_1], return_tensors="pt")
         self.assertIn("input_ids", inputs_url)
         self.assertTrue(len(inputs_url["input_ids"]) == 1)
         self.assertIsInstance(inputs_url["input_ids"], torch.Tensor)
@@ -181,19 +196,20 @@ def test_processor_with_multiple_images_single_list(self):
         # fmt: on
 
     def test_processor_with_multiple_images_multiple_lists(self):
+        processor = self.processor_class.from_pretrained(self.tmpdirname)
         prompt_string = [
             "USER: [IMG][IMG]\nWhat's the difference between these two images? ASSISTANT:",
             "USER: [IMG]\nWhat's the content of the image? ASSISTANT:",
         ]
-        self.processor.tokenizer.pad_token = "</s>"
+        processor.tokenizer.pad_token = "</s>"
         image_inputs = [[self.image_0, self.image_1], [self.image_2]]
 
         # Make small for checking image token expansion
-        self.processor.image_processor.size = {"longest_edge": 30}
-        self.processor.image_processor.patch_size = {"height": 2, "width": 2}
+        processor.image_processor.size = {"longest_edge": 30}
+        processor.image_processor.patch_size = {"height": 2, "width": 2}
 
         # Test passing in an image
-        inputs_image = self.processor(text=prompt_string, images=image_inputs, return_tensors="pt", padding=True)
+        inputs_image = processor(text=prompt_string, images=image_inputs, return_tensors="pt", padding=True)
         self.assertIn("input_ids", inputs_image)
         self.assertTrue(len(inputs_image["input_ids"]) == 2)
         self.assertIsInstance(inputs_image["input_ids"], torch.Tensor)
@@ -213,7 +229,7 @@ def test_processor_with_multiple_images_multiple_lists(self):
         # fmt: on
 
         # Test passing in a url
-        inputs_url = self.processor(text=prompt_string, images=image_inputs, return_tensors="pt", padding=True)
+        inputs_url = processor(text=prompt_string, images=image_inputs, return_tensors="pt", padding=True)
         self.assertIn("input_ids", inputs_url)
         self.assertTrue(len(inputs_url["input_ids"]) == 2)
         self.assertIsInstance(inputs_url["input_ids"], torch.Tensor)
@@ -231,3 +247,145 @@ def test_processor_with_multiple_images_multiple_lists(self):
             [21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
         )
         # fmt: on
+
+    # Override all tests requiring shape as returning tensor batches is not supported by PixtralProcessor
+
+    @require_torch
+    @require_vision
+    def test_image_processor_defaults_preserved_by_image_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor", size={"height": 240, "width": 240})
+        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        inputs = processor(text=input_str, images=image_input)
+        # Added dimension by pixtral image processor
+        self.assertEqual(len(inputs["pixel_values"][0][0][0][0]), 240)
+
+    @require_torch
+    @require_vision
+    def test_kwargs_overrides_default_image_processor_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor", size={"height": 400, "width": 400})
+        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        inputs = processor(text=input_str, images=image_input, size={"height": 240, "width": 240})
+        self.assertEqual(len(inputs["pixel_values"][0][0][0][0]), 240)
+
+    @require_torch
+    @require_vision
+    def test_structured_kwargs_nested(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        # Define the kwargs for each modality
+        all_kwargs = {
+            "common_kwargs": {"return_tensors": "pt"},
+            "images_kwargs": {"size": {"height": 240, "width": 240}},
+            "text_kwargs": {"padding": "max_length", "max_length": 76},
+        }
+
+        inputs = processor(text=input_str, images=image_input, **all_kwargs)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        self.assertEqual(inputs["pixel_values"][0][0].shape[-1], 240)
+
+        self.assertEqual(len(inputs["input_ids"][0]), 76)
+
+    @require_torch
+    @require_vision
+    def test_structured_kwargs_nested_from_dict(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        # Define the kwargs for each modality
+        all_kwargs = {
+            "common_kwargs": {"return_tensors": "pt"},
+            "images_kwargs": {"size": {"height": 240, "width": 240}},
+            "text_kwargs": {"padding": "max_length", "max_length": 76},
+        }
+
+        inputs = processor(text=input_str, images=image_input, **all_kwargs)
+        self.assertEqual(inputs["pixel_values"][0][0].shape[-1], 240)
+
+        self.assertEqual(len(inputs["input_ids"][0]), 76)
+
+    @require_torch
+    @require_vision
+    def test_unstructured_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            size={"height": 240, "width": 240},
+            padding="max_length",
+            max_length=76,
+        )
+
+        self.assertEqual(inputs["pixel_values"][0][0].shape[-1], 240)
+        self.assertEqual(len(inputs["input_ids"][0]), 76)
+
+    @require_torch
+    @require_vision
+    def test_unstructured_kwargs_batched(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = ["lower newer", "upper older longer string"]
+        # images needs to be nested to detect multiple prompts
+        image_input = [self.prepare_image_inputs()] * 2
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            size={"height": 240, "width": 240},
+            padding="longest",
+            max_length=76,
+        )
+
+        self.assertEqual(inputs["pixel_values"][0][0].shape[-1], 240)
+        self.assertEqual(len(inputs["input_ids"][0]), 4)
diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py
index b8ca7a6d6733..f3111f5d5785 100644
--- a/tests/test_processing_common.py
+++ b/tests/test_processing_common.py
@@ -64,6 +64,8 @@ def get_component(self, attribute, **kwargs):
         component = component_class.from_pretrained(self.tmpdirname, **kwargs)  # noqa
         if attribute == "tokenizer" and not component.pad_token:
             component.pad_token = "[TEST_PAD]"
+            if component.pad_token_id is None:
+                component.pad_token_id = 0
 
         return component
 
diff --git a/tests/utils/test_processing_utils.py b/tests/utils/test_processing_utils.py
index cf0e66cd7bd0..f669da25385f 100644
--- a/tests/utils/test_processing_utils.py
+++ b/tests/utils/test_processing_utils.py
@@ -80,6 +80,18 @@ def test_validate_images_text_input_order(self):
         self.assertTrue(np.array_equal(valid_images[0], images[0]))
         self.assertEqual(valid_text, text)
 
+        # list of strings and list of url images inputs
+        images = ["https://url1", "https://url2"]
+        text = ["text1", "text2"]
+        # test correct text and images order
+        valid_images, valid_text = _validate_images_text_input_order(images=images, text=text)
+        self.assertEqual(valid_images, images)
+        self.assertEqual(valid_text, text)
+        # test incorrect text and images order
+        valid_images, valid_text = _validate_images_text_input_order(images=text, text=images)
+        self.assertEqual(valid_images, images)
+        self.assertEqual(valid_text, text)
+
         # list of strings and nested list of numpy images inputs
         images = [[np.random.rand(224, 224, 3), np.random.rand(224, 224, 3)], [np.random.rand(224, 224, 3)]]
         text = ["text1", "text2"]

From 6c051b4e1ebd4464e6cb50bc320fd7a873151faf Mon Sep 17 00:00:00 2001
From: teamclouday <teamclouday@gmail.com>
Date: Tue, 17 Sep 2024 17:11:32 -0400
Subject: [PATCH 29/67] Add revision to trainer push_to_hub (#33482)

* add revision to trainer push_to_hub

* apply suggestions

* add test for revision

* apply ruff format

* reorganize imports

* change test trainer path
---
 src/transformers/trainer.py   |  4 ++++
 tests/trainer/test_trainer.py | 21 ++++++++++++++++++++-
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 97a052093652..e0a49ee5795e 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -4461,6 +4461,7 @@ def push_to_hub(
         commit_message: Optional[str] = "End of training",
         blocking: bool = True,
         token: Optional[str] = None,
+        revision: Optional[str] = None,
         **kwargs,
     ) -> str:
         """
@@ -4473,6 +4474,8 @@ def push_to_hub(
                 Whether the function should return only when the `git push` has finished.
             token (`str`, *optional*, defaults to `None`):
                 Token with write permission to overwrite Trainer's original args.
+            revision (`str`, *optional*):
+                The git revision to commit from. Defaults to the head of the "main" branch.
             kwargs (`Dict[str, Any]`, *optional*):
                 Additional keyword arguments passed along to [`~Trainer.create_model_card`].
 
@@ -4526,6 +4529,7 @@ def push_to_hub(
             token=token,
             run_as_future=not blocking,
             ignore_patterns=["_*", f"{PREFIX_CHECKPOINT_DIR}-*"],
+            revision=revision,
         )
 
     #
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 791486ec8374..c5f8b6169fcf 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -31,7 +31,7 @@
 from unittest.mock import Mock, patch
 
 import numpy as np
-from huggingface_hub import HfFolder, ModelCard, delete_repo, list_repo_commits, list_repo_files
+from huggingface_hub import HfFolder, ModelCard, create_branch, delete_repo, list_repo_commits, list_repo_files
 from parameterized import parameterized
 from requests.exceptions import HTTPError
 
@@ -3933,6 +3933,25 @@ def test_push_to_hub_tags(self):
             model_card = ModelCard.load(repo_name)
             self.assertTrue("test-trainer-tags" in model_card.data.tags)
 
+    def test_push_to_hub_with_revision(self):
+        # Checks if `trainer.push_to_hub()` works correctly by adding revision
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            trainer = get_regression_trainer(
+                output_dir=os.path.join(tmp_dir, "test-trainer-revision"),
+                push_to_hub=True,
+                hub_token=self._token,
+            )
+            branch = "v1.0"
+            create_branch(repo_id=trainer.hub_model_id, branch=branch, token=self._token, exist_ok=True)
+            url = trainer.push_to_hub(revision=branch)
+
+            # Extract branch from the url
+            re_search = re.search(r"tree/([^/]+)/", url)
+            self.assertIsNotNone(re_search)
+
+            branch_name = re_search.groups()[0]
+            self.assertEqual(branch_name, branch)
+
 
 @require_torch
 @require_optuna

From 454a0f2efdf9f0d94b77ef08efabbdc6418c868d Mon Sep 17 00:00:00 2001
From: "Wang, Yi" <yi.a.wang@intel.com>
Date: Wed, 18 Sep 2024 05:24:42 +0800
Subject: [PATCH 30/67] =?UTF-8?q?fix=20patch=5Fattention=5Fmask=20incorrec?=
 =?UTF-8?q?t=20setting=20which=20leads=20to=20the=20differe=E2=80=A6=20(#3?=
 =?UTF-8?q?3499)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix patch_attention_mask incorrect setting which leads to the difference in the generated text if batch > 1

Signed-off-by: Wang, Yi <yi.a.wang@intel.com>

* fix format

Signed-off-by: Wang, Yi <yi.a.wang@intel.com>

* [run_slow] idefics2

---------

Signed-off-by: Wang, Yi <yi.a.wang@intel.com>
---
 .../models/idefics2/modeling_idefics2.py      |  2 +-
 .../models/idefics2/test_modeling_idefics2.py | 35 +++++++++++++++++++
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py
index 08ada424ea77..6108f0e8a42e 100644
--- a/src/transformers/models/idefics2/modeling_idefics2.py
+++ b/src/transformers/models/idefics2/modeling_idefics2.py
@@ -1388,7 +1388,7 @@ def forward(
             patch_size = self.config.vision_config.patch_size
             patches_subgrid = pixel_attention_mask.unfold(dimension=1, size=patch_size, step=patch_size)
             patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size)
-            patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
+            patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) == patch_size * patch_size).bool()
 
             # Get sequence from the vision encoder
             image_hidden_states = self.vision_model(
diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py
index 8d48acb9500d..e02c5b4c9f09 100644
--- a/tests/models/idefics2/test_modeling_idefics2.py
+++ b/tests/models/idefics2/test_modeling_idefics2.py
@@ -540,6 +540,41 @@ def test_integration_test_4bit(self):
         expected_generated_text = "In this image, we see the Statue of Liberty, the Hudson River,"
         self.assertEqual(generated_texts[0], expected_generated_text)
 
+    @slow
+    @require_bitsandbytes
+    def test_integration_test_4bit_batch2(self):
+        # Let' s make sure we test the preprocessing to replace what is used
+
+        model = Idefics2ForConditionalGeneration.from_pretrained(
+            "HuggingFaceM4/idefics2-8b-base",
+            load_in_4bit=True,
+        )
+
+        from datasets import load_dataset
+
+        dataset = load_dataset("nielsr/docvqa_1200_examples", split="test")
+
+        text = [f"<image>{dataset[40]['query']['en']}", f"<image>{dataset[41]['query']['en']}"]
+        images = [[dataset[40]["image"]], [dataset[41]["image"]]]
+        inputs = self.processor(text=text, images=images, padding=True, return_tensors="pt")
+        generated_ids = model.generate(**inputs, max_new_tokens=64)
+        batched_generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
+
+        text = f"<image>{dataset[40]['query']['en']}"
+        images = dataset[40]["image"]
+        inputs = self.processor(text=text, images=images, padding=True, return_tensors="pt")
+        generated_ids = model.generate(**inputs, max_new_tokens=64)
+        generated_text_0 = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
+
+        text = f"<image>{dataset[41]['query']['en']}"
+        images = dataset[41]["image"]
+        inputs = self.processor(text=text, images=images, padding=True, return_tensors="pt")
+        generated_ids = model.generate(**inputs, max_new_tokens=64)
+        generated_text_1 = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
+
+        self.assertEqual(batched_generated_texts[0], generated_text_0[0])
+        self.assertEqual(batched_generated_texts[1], generated_text_1[0])
+
     @require_flash_attn
     @require_torch_gpu
     @require_bitsandbytes

From fee86516a48c92133847fc7b44ca2f83c7c5634d Mon Sep 17 00:00:00 2001
From: Raushan Turganbay <raushan@huggingface.co>
Date: Wed, 18 Sep 2024 09:21:55 +0200
Subject: [PATCH 31/67] Support LLaVa-OV-Chat (#33532)

* add llava-ov-chat

* uncomment
---
 .../convert_llava_onevision_weights_to_hf.py  | 32 +++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/llava_onevision/convert_llava_onevision_weights_to_hf.py b/src/transformers/models/llava_onevision/convert_llava_onevision_weights_to_hf.py
index e8d51f99e67f..65c57f624f54 100644
--- a/src/transformers/models/llava_onevision/convert_llava_onevision_weights_to_hf.py
+++ b/src/transformers/models/llava_onevision/convert_llava_onevision_weights_to_hf.py
@@ -105,9 +105,17 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False):
 
     if model_id in ["lmms-lab/llava-onevision-qwen2-0.5b-ov", "lmms-lab/llava-onevision-qwen2-0.5b-si"]:
         text_model_id = "Qwen/Qwen2-0.5B-Instruct"
-    elif model_id in ["lmms-lab/llava-onevision-qwen2-7b-ov", "lmms-lab/llava-onevision-qwen2-7b-si"]:
+    elif model_id in [
+        "lmms-lab/llava-onevision-qwen2-7b-ov",
+        "lmms-lab/llava-onevision-qwen2-7b-si",
+        "lmms-lab/llava-onevision-qwen2-7b-ov-chat",
+    ]:
         text_model_id = "Qwen/Qwen2-7B-Instruct"
-    elif model_id in ["lmms-lab/llava-onevision-qwen2-72b-ov", "lmms-lab/llava-onevision-qwen2-72b-si"]:
+    elif model_id in [
+        "lmms-lab/llava-onevision-qwen2-72b-ov",
+        "lmms-lab/llava-onevision-qwen2-72b-si",
+        "lmms-lab/llava-onevision-qwen2-72b-ov-chat",
+    ]:
         text_model_id = "Qwen/Qwen2-72B-Instruct"
 
     vision_model_id = data["mm_vision_tower"]
@@ -260,6 +268,20 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False):
                 dtype=torch.float32,
                 device=device,
             )
+        elif model_id == "lmms-lab/llava-onevision-qwen2-7b-ov-chat":
+            # Not yet checked against reference
+            expected_slice = torch.tensor(
+                [[1.8662, 3.4316, 1.3174], [2.7109, 2.5488, 3.0117], [4.4648, 4.9648, 10.3359]],
+                dtype=torch.float32,
+                device=device,
+            )
+        elif model_id == "lmms-lab/llava-onevision-qwen2-72b-ov-chat":
+            # Not yet checked against reference
+            expected_slice = torch.tensor(
+                [[4.3086, 4.7344, 2.6953], [1.7090, 5.1719, 4.0234], [1.3057, 6.3438, 9.5469]],
+                dtype=torch.float32,
+                device=device,
+            )
         else:
             raise ValueError(f"Model {model_id} not supported")
 
@@ -289,6 +311,10 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False):
         expected_text = "system\nYou are a helpful assistant.\nuser\n\nWhat is shown in this image?\nassistant\nThe image shows a radar chart, which is a graphical method of displaying multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point. The chart is used to compare the performance of different models or systems across various benchmarks or metrics.\n\nIn this specific radar chart, there are multiple axes, each representing a different benchmark or metric, such as VQA2, GQA, TextVQA, and others. The chart includes several colored lines"
     elif model_id == "lmms-lab/llava-onevision-qwen2-72b-ov":
         expected_text = "system\nYou are a helpful assistant.\nuser\n\nWhat is shown in this image?\nassistant\nThe image is a radar chart comparing the performance of different models on various multimodal benchmarks. The models compared are BLIP-2, InstructBLIP, POPE, QWen-VL-Chat, and LLava-1.5. The benchmarks include VQAv2, GQA, TextVQA, SQA-IMG, VizWiz, MM-IMDb, MM-VQA, MM-IMDb-CN, MM-IMDb-EN, MM-"
+    elif model_id == "lmms-lab/llava-onevision-qwen2-7b-ov-chat":
+        expected_text = "system\nYou are a helpful assistant.\nuser\n\nWhat is shown in this image?\nassistant\nThe image shows a radar chart, also known as a spider chart or a star chart, which is used to display multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point. Each axis represents a different variable, and the values are plotted along these axes.\n\nIn this particular radar chart, there are multiple lines representing different models or systems, each distinguished by a different color and labeled with a name such as BLIP-2, In"
+    elif model_id == "lmms-lab/llava-onevision-qwen2-72b-ov-chat":
+        expected_text = "system\nYou are a helpful assistant.\nuser\n\nWhat is shown in this image?\nassistant\nThe image is a radar chart comparing the performance of different models on various multimodal benchmarks. The models compared are BLIP-2, InstructBLIP, POPE, QWen-VL-Chat, and LLava-1.5. The benchmarks include VQAv2, GQA, TextVQA, SQA-IMG, VizWiz, MM-IMDb, MM-VQA, MM-IMDb-CN, MM-IMDb-EN, MM-"
     else:
         raise ValueError(f"Model {model_id} not supported")
 
@@ -346,6 +372,8 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False):
             "lmms-lab/llava-onevision-qwen2-7b-ov",
             "lmms-lab/llava-onevision-qwen2-72b-si",
             "lmms-lab/llava-onevision-qwen2-72b-ov",
+            "lmms-lab/llava-onevision-qwen2-7b-ov-chat",
+            "lmms-lab/llava-onevision-qwen2-72b-ov-chat",
         ],
         required=False,
     )

From e6d9f39dd7551d1a95be081cbb59f94c54c3dbf6 Mon Sep 17 00:00:00 2001
From: Aymeric Roucher <69208727+aymeric-roucher@users.noreply.github.com>
Date: Wed, 18 Sep 2024 11:07:51 +0200
Subject: [PATCH 32/67] Decorator for easier tool building (#33439)

* Decorator for tool building
---
 docs/source/en/agents.md                      | 76 +++++----------
 docs/source/en/agents_advanced.md             | 63 ++++++++++++-
 docs/source/en/main_classes/agent.md          |  4 +
 src/transformers/__init__.py                  |  2 +
 src/transformers/agents/__init__.py           |  4 +-
 src/transformers/agents/agent_types.py        |  2 +-
 src/transformers/agents/agents.py             | 18 ++--
 src/transformers/agents/default_tools.py      |  9 +-
 .../agents/document_question_answering.py     |  6 +-
 .../agents/image_question_answering.py        |  4 +-
 src/transformers/agents/prompts.py            |  2 +-
 src/transformers/agents/search.py             |  6 +-
 src/transformers/agents/speech_to_text.py     |  2 +-
 src/transformers/agents/text_to_speech.py     |  2 +-
 src/transformers/agents/tools.py              | 92 ++++++++++++++++---
 src/transformers/agents/translation.py        |  8 +-
 src/transformers/utils/chat_template_utils.py | 10 +-
 tests/agents/test_agents.py                   |  4 +-
 tests/agents/test_final_answer.py             |  8 +-
 tests/agents/test_python_interpreter.py       | 10 +-
 tests/agents/test_tools_common.py             | 75 ++++++++++++++-
 21 files changed, 294 insertions(+), 113 deletions(-)

diff --git a/docs/source/en/agents.md b/docs/source/en/agents.md
index 0b889f4eec86..ac06c04d9baa 100644
--- a/docs/source/en/agents.md
+++ b/docs/source/en/agents.md
@@ -325,62 +325,37 @@ model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
 print(model.id)
 ```
 
-This code can be converted into a class that inherits from the [`Tool`] superclass.
+This code can quickly be converted into a tool, just by wrapping it in a function and adding the `tool` decorator:
 
 
-The custom tool needs:
-- An attribute `name`, which corresponds to the name of the tool itself. The name usually describes what the tool does. Since the code returns the model with the most downloads for a task, let's name is `model_download_counter`.
-- An attribute `description` is used to populate the agent's system prompt.
-- An `inputs` attribute, which is a dictionary with keys `"type"` and `"description"`. It contains information that helps the Python interpreter make educated choices about the input.
-- An `output_type` attribute, which specifies the output type.
-- A `forward` method which contains the inference code to be executed.
-
-
-```python
-from transformers import Tool
-from huggingface_hub import list_models
-
-class HFModelDownloadsTool(Tool):
-    name = "model_download_counter"
-    description = (
-        "This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub. "
-        "It returns the name of the checkpoint."
-    )
-
-    inputs = {
-        "task": {
-            "type": "text",
-            "description": "the task category (such as text-classification, depth-estimation, etc)",
-        }
-    }
-    output_type = "text"
-
-    def forward(self, task: str):
-        model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
-        return model.id
-```
-
-Now that the custom `HfModelDownloadsTool` class is ready, you can save it to a file named `model_downloads.py` and import it for use.
-
-
-```python
-from model_downloads import HFModelDownloadsTool
-
-tool = HFModelDownloadsTool()
-```
-
-You can also share your custom tool to the Hub by calling [`~Tool.push_to_hub`] on the tool. Make sure you've created a repository for it on the Hub and are using a token with read access.
-
-```python
-tool.push_to_hub("{your_username}/hf-model-downloads")
+```py
+from transformers import tool
+
+@tool
+def model_download_counter(task: str) -> str:
+    """
+    This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub.
+    It returns the name of the checkpoint.
+
+    Args:
+        task: The task for which
+    """
+    model = next(iter(list_models(filter="text-classification", sort="downloads", direction=-1)))
+    return model.id
 ```
 
-Load the tool with the [`~Tool.load_tool`] function and pass it to the `tools` parameter in your agent.
+The function needs:
+- A clear name. The name usually describes what the tool does. Since the code returns the model with the most downloads for a task, let's put `model_download_counter`.
+- Type hints on both inputs and output
+- A description, that includes an 'Args:' part where each argument is described (without a type indication this time, it will be pulled from the type hint).
+All these will be automatically baked into the agent's system prompt upon initialization: so strive to make them as clear as possible!
 
-```python
-from transformers import load_tool, CodeAgent
+> [!TIP]
+> This definition format is the same as tool schemas used in `apply_chat_template`, the only difference is the added `tool` decorator: read more on our tool use API [here](https://huggingface.co/blog/unified-tool-use#passing-tools-to-a-chat-template).
 
-model_download_tool = load_tool("m-ric/hf-model-downloads")
+Then you can directly initialize your agent:
+```py
+from transformers import CodeAgent
 agent = CodeAgent(tools=[model_download_tool], llm_engine=llm_engine)
 agent.run(
     "Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?"
@@ -400,7 +375,6 @@ print(f"The most downloaded model for the 'text-to-video' task is {most_download
 And the output:
 `"The most downloaded model for the 'text-to-video' task is ByteDance/AnimateDiff-Lightning."`
 
-
 ### Manage your agent's toolbox
 
 If you have already initialized an agent, it is inconvenient to reinitialize it from scratch with a tool you want to use. With Transformers, you can manage an agent's toolbox by adding or replacing a tool.
diff --git a/docs/source/en/agents_advanced.md b/docs/source/en/agents_advanced.md
index 399eeb9b70eb..2327357525d8 100644
--- a/docs/source/en/agents_advanced.md
+++ b/docs/source/en/agents_advanced.md
@@ -60,7 +60,68 @@ manager_agent.run("Who is the CEO of Hugging Face?")
 > For an in-depth example of an efficient multi-agent implementation, see [how we pushed our multi-agent system to the top of the GAIA leaderboard](https://huggingface.co/blog/beating-gaia).
 
 
-## Use tools from gradio or LangChain
+## Advanced tool usage
+
+### Directly define a tool by subclassing Tool, and share it to the Hub
+
+Let's take again the tool example from main documentation, for which we had implemented a `tool` decorator.
+
+If you need to add variation, like custom attributes for your too, you can build your tool following the fine-grained method: building a class that inherits from the [`Tool`] superclass.
+
+The custom tool needs:
+- An attribute `name`, which corresponds to the name of the tool itself. The name usually describes what the tool does. Since the code returns the model with the most downloads for a task, let's name is `model_download_counter`.
+- An attribute `description` is used to populate the agent's system prompt.
+- An `inputs` attribute, which is a dictionary with keys `"type"` and `"description"`. It contains information that helps the Python interpreter make educated choices about the input.
+- An `output_type` attribute, which specifies the output type.
+- A `forward` method which contains the inference code to be executed.
+
+The types for both `inputs` and `output_type` should be amongst [Pydantic formats](https://docs.pydantic.dev/latest/concepts/json_schema/#generating-json-schema).
+
+```python
+from transformers import Tool
+from huggingface_hub import list_models
+
+class HFModelDownloadsTool(Tool):
+    name = "model_download_counter"
+    description = """
+    This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub.
+    It returns the name of the checkpoint."""
+
+    inputs = {
+        "task": {
+            "type": "string",
+            "description": "the task category (such as text-classification, depth-estimation, etc)",
+        }
+    }
+    output_type = "string"
+
+    def forward(self, task: str):
+        model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
+        return model.id
+```
+
+Now that the custom `HfModelDownloadsTool` class is ready, you can save it to a file named `model_downloads.py` and import it for use.
+
+
+```python
+from model_downloads import HFModelDownloadsTool
+
+tool = HFModelDownloadsTool()
+```
+
+You can also share your custom tool to the Hub by calling [`~Tool.push_to_hub`] on the tool. Make sure you've created a repository for it on the Hub and are using a token with read access.
+
+```python
+tool.push_to_hub("{your_username}/hf-model-downloads")
+```
+
+Load the tool with the [`~Tool.load_tool`] function and pass it to the `tools` parameter in your agent.
+
+```python
+from transformers import load_tool, CodeAgent
+
+model_download_tool = load_tool("m-ric/hf-model-downloads")
+```
 
 ### Use gradio-tools
 
diff --git a/docs/source/en/main_classes/agent.md b/docs/source/en/main_classes/agent.md
index 8628785815ce..ed0486b60128 100644
--- a/docs/source/en/main_classes/agent.md
+++ b/docs/source/en/main_classes/agent.md
@@ -60,6 +60,10 @@ We provide two types of agents, based on the main [`Agent`] class:
 
 [[autodoc]] load_tool
 
+### tool
+
+[[autodoc]] tool
+
 ### Tool
 
 [[autodoc]] Tool
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 36775d8454ab..bfd0d37916b5 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -70,6 +70,7 @@
         "launch_gradio_demo",
         "load_tool",
         "stream_to_gradio",
+        "tool",
     ],
     "audio_utils": [],
     "benchmark": [],
@@ -4819,6 +4820,7 @@
         launch_gradio_demo,
         load_tool,
         stream_to_gradio,
+        tool,
     )
     from .configuration_utils import PretrainedConfig
 
diff --git a/src/transformers/agents/__init__.py b/src/transformers/agents/__init__.py
index d053e385cf7a..70762c252a83 100644
--- a/src/transformers/agents/__init__.py
+++ b/src/transformers/agents/__init__.py
@@ -27,7 +27,7 @@
     "agents": ["Agent", "CodeAgent", "ManagedAgent", "ReactAgent", "ReactCodeAgent", "ReactJsonAgent", "Toolbox"],
     "llm_engine": ["HfApiEngine", "TransformersEngine"],
     "monitoring": ["stream_to_gradio"],
-    "tools": ["PipelineTool", "Tool", "ToolCollection", "launch_gradio_demo", "load_tool"],
+    "tools": ["PipelineTool", "Tool", "ToolCollection", "launch_gradio_demo", "load_tool", "tool"],
 }
 
 try:
@@ -48,7 +48,7 @@
     from .agents import Agent, CodeAgent, ManagedAgent, ReactAgent, ReactCodeAgent, ReactJsonAgent, Toolbox
     from .llm_engine import HfApiEngine, TransformersEngine
     from .monitoring import stream_to_gradio
-    from .tools import PipelineTool, Tool, ToolCollection, launch_gradio_demo, load_tool
+    from .tools import PipelineTool, Tool, ToolCollection, launch_gradio_demo, load_tool, tool
 
     try:
         if not is_torch_available():
diff --git a/src/transformers/agents/agent_types.py b/src/transformers/agents/agent_types.py
index 4a36eaaee051..f5be7462657c 100644
--- a/src/transformers/agents/agent_types.py
+++ b/src/transformers/agents/agent_types.py
@@ -234,7 +234,7 @@ def to_string(self):
             return self._path
 
 
-AGENT_TYPE_MAPPING = {"text": AgentText, "image": AgentImage, "audio": AgentAudio}
+AGENT_TYPE_MAPPING = {"string": AgentText, "image": AgentImage, "audio": AgentAudio}
 INSTANCE_TYPE_MAPPING = {str: AgentText, ImageType: AgentImage}
 
 if is_torch_available():
diff --git a/src/transformers/agents/agents.py b/src/transformers/agents/agents.py
index 5a4aea28d970..73b7186d25a3 100644
--- a/src/transformers/agents/agents.py
+++ b/src/transformers/agents/agents.py
@@ -22,7 +22,7 @@
 from .. import is_torch_available
 from ..utils import logging as transformers_logging
 from ..utils.import_utils import is_pygments_available
-from .agent_types import AgentAudio, AgentImage, AgentText
+from .agent_types import AgentAudio, AgentImage
 from .default_tools import BASE_PYTHON_TOOLS, FinalAnswerTool, setup_default_tools
 from .llm_engine import HfApiEngine, MessageRole
 from .prompts import (
@@ -626,10 +626,9 @@ def run(self, task: str, return_generated_code: bool = False, **kwargs):
         Example:
 
         ```py
-        from transformers.agents import CodeAgent, PythonInterpreterTool
+        from transformers.agents import CodeAgent
 
-        python_interpreter = PythonInterpreterTool()
-        agent = CodeAgent(tools=[python_interpreter])
+        agent = CodeAgent(tools=[])
         agent.run("What is the result of 2 power 3.7384?")
         ```
         """
@@ -1019,20 +1018,17 @@ def step(self):
                 arguments = {}
             observation = self.execute_tool_call(tool_name, arguments)
             observation_type = type(observation)
-            if observation_type == AgentText:
-                updated_information = str(observation).strip()
-            else:
-                # TODO: observation naming could allow for different names of same type
+            if observation_type in [AgentImage, AgentAudio]:
                 if observation_type == AgentImage:
                     observation_name = "image.png"
                 elif observation_type == AgentAudio:
                     observation_name = "audio.mp3"
-                else:
-                    observation_name = "object.object"
+                # TODO: observation naming could allow for different names of same type
 
                 self.state[observation_name] = observation
                 updated_information = f"Stored '{observation_name}' in memory."
-
+            else:
+                updated_information = str(observation).strip()
             self.logger.info(updated_information)
             current_step_logs["observation"] = updated_information
             return current_step_logs
diff --git a/src/transformers/agents/default_tools.py b/src/transformers/agents/default_tools.py
index b02b12d5287c..3946aa9f8735 100644
--- a/src/transformers/agents/default_tools.py
+++ b/src/transformers/agents/default_tools.py
@@ -152,8 +152,7 @@ class PythonInterpreterTool(Tool):
     name = "python_interpreter"
     description = "This is a tool that evaluates python code. It can be used to perform calculations."
 
-    output_type = "text"
-    available_tools = BASE_PYTHON_TOOLS.copy()
+    output_type = "string"
 
     def __init__(self, *args, authorized_imports=None, **kwargs):
         if authorized_imports is None:
@@ -162,7 +161,7 @@ def __init__(self, *args, authorized_imports=None, **kwargs):
             self.authorized_imports = list(set(LIST_SAFE_MODULES) | set(authorized_imports))
         self.inputs = {
             "code": {
-                "type": "text",
+                "type": "string",
                 "description": (
                     "The code snippet to evaluate. All variables used in this snippet must be defined in this same snippet, "
                     f"else you will get an error. This code can only import the following python libraries: {authorized_imports}."
@@ -173,7 +172,7 @@ def __init__(self, *args, authorized_imports=None, **kwargs):
 
     def forward(self, code):
         output = str(
-            evaluate_python_code(code, static_tools=self.available_tools, authorized_imports=self.authorized_imports)
+            evaluate_python_code(code, static_tools=BASE_PYTHON_TOOLS, authorized_imports=self.authorized_imports)
         )
         return output
 
@@ -181,7 +180,7 @@ def forward(self, code):
 class FinalAnswerTool(Tool):
     name = "final_answer"
     description = "Provides a final answer to the given problem."
-    inputs = {"answer": {"type": "text", "description": "The final answer to the problem"}}
+    inputs = {"answer": {"type": "any", "description": "The final answer to the problem"}}
     output_type = "any"
 
     def forward(self, answer):
diff --git a/src/transformers/agents/document_question_answering.py b/src/transformers/agents/document_question_answering.py
index 030120ac6c7f..23ae5b042912 100644
--- a/src/transformers/agents/document_question_answering.py
+++ b/src/transformers/agents/document_question_answering.py
@@ -31,7 +31,7 @@
 
 class DocumentQuestionAnsweringTool(PipelineTool):
     default_checkpoint = "naver-clova-ix/donut-base-finetuned-docvqa"
-    description = "This is a tool that answers a question about an document (pdf). It returns a text that contains the answer to the question."
+    description = "This is a tool that answers a question about an document (pdf). It returns a string that contains the answer to the question."
     name = "document_qa"
     pre_processor_class = AutoProcessor
     model_class = VisionEncoderDecoderModel
@@ -41,9 +41,9 @@ class DocumentQuestionAnsweringTool(PipelineTool):
             "type": "image",
             "description": "The image containing the information. Can be a PIL Image or a string path to the image.",
         },
-        "question": {"type": "text", "description": "The question in English"},
+        "question": {"type": "string", "description": "The question in English"},
     }
-    output_type = "text"
+    output_type = "string"
 
     def __init__(self, *args, **kwargs):
         if not is_vision_available():
diff --git a/src/transformers/agents/image_question_answering.py b/src/transformers/agents/image_question_answering.py
index 020d22c47f91..de0efb7b6f38 100644
--- a/src/transformers/agents/image_question_answering.py
+++ b/src/transformers/agents/image_question_answering.py
@@ -38,9 +38,9 @@ class ImageQuestionAnsweringTool(PipelineTool):
             "type": "image",
             "description": "The image containing the information. Can be a PIL Image or a string path to the image.",
         },
-        "question": {"type": "text", "description": "The question in English"},
+        "question": {"type": "string", "description": "The question in English"},
     }
-    output_type = "text"
+    output_type = "string"
 
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["vision"])
diff --git a/src/transformers/agents/prompts.py b/src/transformers/agents/prompts.py
index de8ad1d28490..7a84b1db44fa 100644
--- a/src/transformers/agents/prompts.py
+++ b/src/transformers/agents/prompts.py
@@ -199,7 +199,7 @@ def download_prompt(prompt_or_repo_id, agent_name, mode="run"):
 Action:
 {
   "action": "image_generator",
-  "action_input": {"text": ""A portrait of John Doe, a 55-year-old man living in Canada.""}
+  "action_input": {"prompt": "A portrait of John Doe, a 55-year-old man living in Canada."}
 }<end_action>
 Observation: "image.png"
 
diff --git a/src/transformers/agents/search.py b/src/transformers/agents/search.py
index 5ce36bf99b56..f50a7c6ab8f9 100644
--- a/src/transformers/agents/search.py
+++ b/src/transformers/agents/search.py
@@ -26,7 +26,7 @@ class DuckDuckGoSearchTool(Tool):
     name = "web_search"
     description = """Perform a web search based on your query (think a Google search) then returns the top search results as a list of dict elements.
     Each result has keys 'title', 'href' and 'body'."""
-    inputs = {"query": {"type": "text", "description": "The search query to perform."}}
+    inputs = {"query": {"type": "string", "description": "The search query to perform."}}
     output_type = "any"
 
     def forward(self, query: str) -> str:
@@ -45,11 +45,11 @@ class VisitWebpageTool(Tool):
     description = "Visits a wbepage at the given url and returns its content as a markdown string."
     inputs = {
         "url": {
-            "type": "text",
+            "type": "string",
             "description": "The url of the webpage to visit.",
         }
     }
-    output_type = "text"
+    output_type = "string"
 
     def forward(self, url: str) -> str:
         try:
diff --git a/src/transformers/agents/speech_to_text.py b/src/transformers/agents/speech_to_text.py
index 817b6319e6b8..8061651a0864 100644
--- a/src/transformers/agents/speech_to_text.py
+++ b/src/transformers/agents/speech_to_text.py
@@ -27,7 +27,7 @@ class SpeechToTextTool(PipelineTool):
     model_class = WhisperForConditionalGeneration
 
     inputs = {"audio": {"type": "audio", "description": "The audio to transcribe"}}
-    output_type = "text"
+    output_type = "string"
 
     def encode(self, audio):
         return self.pre_processor(audio, return_tensors="pt")
diff --git a/src/transformers/agents/text_to_speech.py b/src/transformers/agents/text_to_speech.py
index 3166fab8023c..ed41ef6017ae 100644
--- a/src/transformers/agents/text_to_speech.py
+++ b/src/transformers/agents/text_to_speech.py
@@ -36,7 +36,7 @@ class TextToSpeechTool(PipelineTool):
     model_class = SpeechT5ForTextToSpeech
     post_processor_class = SpeechT5HifiGan
 
-    inputs = {"text": {"type": "text", "description": "The text to read out loud (in English)"}}
+    inputs = {"text": {"type": "string", "description": "The text to read out loud (in English)"}}
     output_type = "audio"
 
     def setup(self):
diff --git a/src/transformers/agents/tools.py b/src/transformers/agents/tools.py
index f97ccc2e10ce..cfb1e4cf95ce 100644
--- a/src/transformers/agents/tools.py
+++ b/src/transformers/agents/tools.py
@@ -16,12 +16,13 @@
 # limitations under the License.
 import base64
 import importlib
+import inspect
 import io
 import json
 import os
 import tempfile
-from functools import lru_cache
-from typing import Any, Dict, List, Optional, Union
+from functools import lru_cache, wraps
+from typing import Any, Callable, Dict, List, Optional, Union
 
 from huggingface_hub import create_repo, get_collection, hf_hub_download, metadata_update, upload_folder
 from huggingface_hub.utils import RepositoryNotFoundError, build_hf_headers, get_session
@@ -35,7 +36,9 @@
 from ..models.auto import AutoProcessor
 from ..utils import (
     CONFIG_NAME,
+    TypeHintParsingException,
     cached_file,
+    get_json_schema,
     is_accelerate_available,
     is_torch_available,
     is_vision_available,
@@ -84,6 +87,20 @@ def get_repo_type(repo_id, repo_type=None, **hub_kwargs):
 """
 
 
+def validate_after_init(cls):
+    original_init = cls.__init__
+
+    @wraps(original_init)
+    def new_init(self, *args, **kwargs):
+        original_init(self, *args, **kwargs)
+        if not isinstance(self, PipelineTool):
+            self.validate_arguments()
+
+    cls.__init__ = new_init
+    return cls
+
+
+@validate_after_init
 class Tool:
     """
     A base class for the functions used by the agent. Subclass this and implement the `__call__` method as well as the
@@ -114,17 +131,35 @@ class Tool:
     def __init__(self, *args, **kwargs):
         self.is_initialized = False
 
-    def validate_attributes(self):
+    def validate_arguments(self):
         required_attributes = {
             "description": str,
             "name": str,
             "inputs": Dict,
-            "output_type": type,
+            "output_type": str,
         }
+        authorized_types = ["string", "integer", "number", "image", "audio", "any"]
+
         for attr, expected_type in required_attributes.items():
             attr_value = getattr(self, attr, None)
             if not isinstance(attr_value, expected_type):
-                raise TypeError(f"Instance attribute {attr} must exist and be of type {expected_type.__name__}")
+                raise TypeError(f"You must set an attribute {attr} of type {expected_type.__name__}.")
+        for input_name, input_content in self.inputs.items():
+            assert "type" in input_content, f"Input '{input_name}' should specify a type."
+            if input_content["type"] not in authorized_types:
+                raise Exception(
+                    f"Input '{input_name}': type '{input_content['type']}' is not an authorized value, should be one of {authorized_types}."
+                )
+            assert "description" in input_content, f"Input '{input_name}' should have a description."
+
+        assert getattr(self, "output_type", None) in authorized_types
+
+        if not isinstance(self, PipelineTool):
+            signature = inspect.signature(self.forward)
+            if not set(signature.parameters.keys()) == set(self.inputs.keys()):
+                raise Exception(
+                    "Tool's 'forward' method should take 'self' as its first argument, then its next arguments should match the keys of tool attribute 'inputs'."
+                )
 
     def forward(self, *args, **kwargs):
         return NotImplemented("Write this method in your subclass of `Tool`.")
@@ -382,7 +417,7 @@ def __init__(self, _gradio_tool):
                 super().__init__()
                 self.name = _gradio_tool.name
                 self.description = _gradio_tool.description
-                self.output_type = "text"
+                self.output_type = "string"
                 self._gradio_tool = _gradio_tool
                 func_args = list(inspect.signature(_gradio_tool.run).parameters.keys())
                 self.inputs = {key: "" for key in func_args}
@@ -404,7 +439,7 @@ def __init__(self, _langchain_tool):
                 self.name = _langchain_tool.name.lower()
                 self.description = _langchain_tool.description
                 self.inputs = parse_langchain_args(_langchain_tool.args)
-                self.output_type = "text"
+                self.output_type = "string"
                 self.langchain_tool = _langchain_tool
 
             def forward(self, *args, **kwargs):
@@ -421,6 +456,7 @@ def forward(self, *args, **kwargs):
 DEFAULT_TOOL_DESCRIPTION_TEMPLATE = """
 - {{ tool.name }}: {{ tool.description }}
     Takes inputs: {{tool.inputs}}
+    Returns an output of type: {{tool.output_type}}
 """
 
 
@@ -621,18 +657,18 @@ def fn(*args, **kwargs):
     gradio_inputs = []
     for input_name, input_details in tool_class.inputs.items():
         input_type = input_details["type"]
-        if input_type == "text":
-            gradio_inputs.append(gr.Textbox(label=input_name))
-        elif input_type == "image":
+        if input_type == "image":
             gradio_inputs.append(gr.Image(label=input_name))
         elif input_type == "audio":
             gradio_inputs.append(gr.Audio(label=input_name))
+        elif input_type in ["string", "integer", "number"]:
+            gradio_inputs.append(gr.Textbox(label=input_name))
         else:
             error_message = f"Input type '{input_type}' not supported."
             raise ValueError(error_message)
 
     gradio_output = tool_class.output_type
-    assert gradio_output in ["text", "image", "audio"], f"Output type '{gradio_output}' not supported."
+    assert gradio_output in ["string", "image", "audio"], f"Output type '{gradio_output}' not supported."
 
     gr.Interface(
         fn=fn,
@@ -808,3 +844,37 @@ def __init__(self, collection_slug: str, token: Optional[str] = None):
         self._collection = get_collection(collection_slug, token=token)
         self._hub_repo_ids = {item.item_id for item in self._collection.items if item.item_type == "space"}
         self.tools = {Tool.from_hub(repo_id) for repo_id in self._hub_repo_ids}
+
+
+def tool(tool_function: Callable) -> Tool:
+    """
+    Converts a function into an instance of a Tool subclass.
+
+    Args:
+        tool_function: Your function. Should have type hints for each input and a type hint for the output.
+        Should also have a docstring description including an 'Args:' part where each argument is described.
+    """
+    parameters = get_json_schema(tool_function)["function"]
+    if "return" not in parameters:
+        raise TypeHintParsingException("Tool return type not found: make sure your function has a return type hint!")
+    class_name = f"{parameters['name'].capitalize()}Tool"
+
+    class SpecificTool(Tool):
+        name = parameters["name"]
+        description = parameters["description"]
+        inputs = parameters["parameters"]["properties"]
+        output_type = parameters["return"]["type"]
+
+        @wraps(tool_function)
+        def forward(self, *args, **kwargs):
+            return tool_function(*args, **kwargs)
+
+    original_signature = inspect.signature(tool_function)
+    new_parameters = [inspect.Parameter("self", inspect.Parameter.POSITIONAL_OR_KEYWORD)] + list(
+        original_signature.parameters.values()
+    )
+    new_signature = original_signature.replace(parameters=new_parameters)
+    SpecificTool.forward.__signature__ = new_signature
+
+    SpecificTool.__name__ = class_name
+    return SpecificTool()
diff --git a/src/transformers/agents/translation.py b/src/transformers/agents/translation.py
index efc97c6e0b20..7ae61f9679b8 100644
--- a/src/transformers/agents/translation.py
+++ b/src/transformers/agents/translation.py
@@ -249,17 +249,17 @@ class TranslationTool(PipelineTool):
     model_class = AutoModelForSeq2SeqLM
 
     inputs = {
-        "text": {"type": "text", "description": "The text to translate"},
+        "text": {"type": "string", "description": "The text to translate"},
         "src_lang": {
-            "type": "text",
+            "type": "string",
             "description": "The language of the text to translate. Written in plain English, such as 'Romanian', or 'Albanian'",
         },
         "tgt_lang": {
-            "type": "text",
+            "type": "string",
             "description": "The language for the desired ouput language. Written in plain English, such as 'Romanian', or 'Albanian'",
         },
     }
-    output_type = "text"
+    output_type = "string"
 
     def encode(self, text, src_lang, tgt_lang):
         if src_lang not in self.lang_to_code:
diff --git a/src/transformers/utils/chat_template_utils.py b/src/transformers/utils/chat_template_utils.py
index aabaf4a36665..74912ce30146 100644
--- a/src/transformers/utils/chat_template_utils.py
+++ b/src/transformers/utils/chat_template_utils.py
@@ -22,7 +22,7 @@
 
 from packaging import version
 
-from .import_utils import is_jinja_available
+from .import_utils import is_jinja_available, is_torch_available, is_vision_available
 
 
 if is_jinja_available():
@@ -32,6 +32,12 @@
 else:
     jinja2 = None
 
+if is_vision_available():
+    from PIL.Image import Image
+
+if is_torch_available():
+    from torch import Tensor
+
 
 BASIC_TYPES = (int, float, str, bool, Any, type(None), ...)
 # Extracts the initial segment of the docstring, containing the function description
@@ -70,6 +76,8 @@ def _get_json_schema_type(param_type: str) -> Dict[str, str]:
         float: {"type": "number"},
         str: {"type": "string"},
         bool: {"type": "boolean"},
+        Image: {"type": "image"},
+        Tensor: {"type": "audio"},
         Any: {},
     }
     return type_mapping.get(param_type, {"type": "object"})
diff --git a/tests/agents/test_agents.py b/tests/agents/test_agents.py
index 67cb31b7dac3..4f24abbeedd8 100644
--- a/tests/agents/test_agents.py
+++ b/tests/agents/test_agents.py
@@ -68,7 +68,6 @@ def fake_react_code_llm(messages, stop_sequences=None, grammar=None) -> str:
 Code:
 ```py
 result = 2**3.6452
-print(result)
 ```<end_code>
 """
     else:  # We're at step 2
@@ -181,7 +180,6 @@ def test_fake_react_code_agent(self):
         assert isinstance(output, float)
         assert output == 7.2904
         assert agent.logs[0]["task"] == "What is 2 multiplied by 3.6452?"
-        assert float(agent.logs[1]["observation"].strip()) - 12.511648 < 1e-6
         assert agent.logs[2]["tool_call"] == {
             "tool_arguments": "final_answer(7.2904)",
             "tool_name": "code interpreter",
@@ -234,7 +232,7 @@ def test_init_agent_with_different_toolsets(self):
 
         # check that python_interpreter base tool does not get added to code agents
         agent = ReactCodeAgent(tools=[], llm_engine=fake_react_code_llm, add_base_tools=True)
-        assert len(agent.toolbox.tools) == 6  # added final_answer tool + 5 base tools (excluding interpreter)
+        assert len(agent.toolbox.tools) == 7  # added final_answer tool + 6 base tools (excluding interpreter)
 
     def test_function_persistence_across_steps(self):
         agent = ReactCodeAgent(
diff --git a/tests/agents/test_final_answer.py b/tests/agents/test_final_answer.py
index 59d5dec84b57..91bdd65e89a8 100644
--- a/tests/agents/test_final_answer.py
+++ b/tests/agents/test_final_answer.py
@@ -19,8 +19,9 @@
 import numpy as np
 from PIL import Image
 
-from transformers import is_torch_available, load_tool
+from transformers import is_torch_available
 from transformers.agents.agent_types import AGENT_TYPE_MAPPING
+from transformers.agents.default_tools import FinalAnswerTool
 from transformers.testing_utils import get_tests_dir, require_torch
 
 from .test_tools_common import ToolTesterMixin
@@ -33,8 +34,7 @@
 class FinalAnswerToolTester(unittest.TestCase, ToolTesterMixin):
     def setUp(self):
         self.inputs = {"answer": "Final answer"}
-        self.tool = load_tool("final_answer")
-        self.tool.setup()
+        self.tool = FinalAnswerTool()
 
     def test_exact_match_arg(self):
         result = self.tool("Final answer")
@@ -52,7 +52,7 @@ def create_inputs(self):
             )
         }
         inputs_audio = {"answer": torch.Tensor(np.ones(3000))}
-        return {"text": inputs_text, "image": inputs_image, "audio": inputs_audio}
+        return {"string": inputs_text, "image": inputs_image, "audio": inputs_audio}
 
     @require_torch
     def test_agent_type_output(self):
diff --git a/tests/agents/test_python_interpreter.py b/tests/agents/test_python_interpreter.py
index 84710cfec685..15e5ad7bb3a3 100644
--- a/tests/agents/test_python_interpreter.py
+++ b/tests/agents/test_python_interpreter.py
@@ -391,8 +391,9 @@ def test_if_conditions(self):
         code = """char='a'
 if char.isalpha():
     print('2')"""
-        result = evaluate_python_code(code, BASE_PYTHON_TOOLS, state={})
-        assert result == "2"
+        state = {}
+        evaluate_python_code(code, BASE_PYTHON_TOOLS, state=state)
+        assert state["print_outputs"] == "2\n"
 
     def test_imports(self):
         code = "import math\nmath.sqrt(4)"
@@ -469,7 +470,7 @@ def test_print_output(self):
         code = "print('Hello world!')\nprint('Ok no one cares')"
         state = {}
         result = evaluate_python_code(code, BASE_PYTHON_TOOLS, state=state)
-        assert result == "Ok no one cares"
+        assert result is None
         assert state["print_outputs"] == "Hello world!\nOk no one cares\n"
 
         # test print in function
@@ -593,8 +594,7 @@ def method_that_raises(self):
     def test_print(self):
         code = "print(min([1, 2, 3]))"
         state = {}
-        result = evaluate_python_code(code, {"min": min, "print": print}, state=state)
-        assert result == "1"
+        evaluate_python_code(code, {"min": min, "print": print}, state=state)
         assert state["print_outputs"] == "1\n"
 
     def test_types_as_objects(self):
diff --git a/tests/agents/test_tools_common.py b/tests/agents/test_tools_common.py
index 679473d0f24b..8226e7109884 100644
--- a/tests/agents/test_tools_common.py
+++ b/tests/agents/test_tools_common.py
@@ -12,13 +12,16 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import unittest
 from pathlib import Path
 from typing import Dict, Union
 
 import numpy as np
+import pytest
 
 from transformers import is_torch_available, is_vision_available
 from transformers.agents.agent_types import AGENT_TYPE_MAPPING, AgentAudio, AgentImage, AgentText
+from transformers.agents.tools import Tool, tool
 from transformers.testing_utils import get_tests_dir, is_agent_test
 
 
@@ -29,7 +32,7 @@
     from PIL import Image
 
 
-AUTHORIZED_TYPES = ["text", "audio", "image", "any"]
+AUTHORIZED_TYPES = ["string", "boolean", "integer", "number", "audio", "image", "any"]
 
 
 def create_inputs(tool_inputs: Dict[str, Dict[Union[str, type], str]]):
@@ -38,7 +41,7 @@ def create_inputs(tool_inputs: Dict[str, Dict[Union[str, type], str]]):
     for input_name, input_desc in tool_inputs.items():
         input_type = input_desc["type"]
 
-        if input_type == "text":
+        if input_type == "string":
             inputs[input_name] = "Text input"
         elif input_type == "image":
             inputs[input_name] = Image.open(
@@ -54,7 +57,7 @@ def create_inputs(tool_inputs: Dict[str, Dict[Union[str, type], str]]):
 
 def output_type(output):
     if isinstance(output, (str, AgentText)):
-        return "text"
+        return "string"
     elif isinstance(output, (Image.Image, AgentImage)):
         return "image"
     elif isinstance(output, (torch.Tensor, AgentAudio)):
@@ -100,3 +103,69 @@ def test_agent_types_inputs(self):
         for _input, expected_input in zip(inputs, self.tool.inputs.values()):
             input_type = expected_input["type"]
             _inputs.append(AGENT_TYPE_MAPPING[input_type](_input))
+
+
+class ToolTests(unittest.TestCase):
+    def test_tool_init_with_decorator(self):
+        @tool
+        def coolfunc(a: str, b: int) -> float:
+            """Cool function
+
+            Args:
+                a: The first argument
+                b: The second one
+            """
+            return b + 2, a
+
+        assert coolfunc.output_type == "number"
+
+    def test_tool_init_vanilla(self):
+        class HFModelDownloadsTool(Tool):
+            name = "model_download_counter"
+            description = """
+            This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub.
+            It returns the name of the checkpoint."""
+
+            inputs = {
+                "task": {
+                    "type": "string",
+                    "description": "the task category (such as text-classification, depth-estimation, etc)",
+                }
+            }
+            output_type = "integer"
+
+            def forward(self, task):
+                return "best model"
+
+        tool = HFModelDownloadsTool()
+        assert list(tool.inputs.keys())[0] == "task"
+
+    def test_tool_init_decorator_raises_issues(self):
+        with pytest.raises(Exception) as e:
+
+            @tool
+            def coolfunc(a: str, b: int):
+                """Cool function
+
+                Args:
+                    a: The first argument
+                    b: The second one
+                """
+                return a + b
+
+            assert coolfunc.output_type == "number"
+        assert "Tool return type not found" in str(e)
+
+        with pytest.raises(Exception) as e:
+
+            @tool
+            def coolfunc(a: str, b: int) -> int:
+                """Cool function
+
+                Args:
+                    a: The first argument
+                """
+                return b + a
+
+            assert coolfunc.output_type == "number"
+        assert "docstring has no description for the argument" in str(e)

From 52e22cbf677f58a1cca347117e8974fa50c9f2d7 Mon Sep 17 00:00:00 2001
From: Duygu Altinok <duygu.altinok12@gmail.com>
Date: Wed, 18 Sep 2024 13:32:02 +0300
Subject: [PATCH 33/67] Fix for slow the bug tokenizer adding spaces to single
 id decodes (#32564)

* _decode signature change and quick return

* added bunch of decoding tests

* signature match and return

* added tests for decoding

* merged decoding test

* more tests for special tokens

* cosmetics

* fixed param

* ruffed the file

* refinement for single special tokens

* added test for single special tokens

* slight change to test name

Co-authored-by: Ita Zaporozhets <31893021+itazap@users.noreply.github.com>

* minor change test name for skip tokens

Co-authored-by: Ita Zaporozhets <31893021+itazap@users.noreply.github.com>

* killed already defined var

Co-authored-by: Ita Zaporozhets <31893021+itazap@users.noreply.github.com>

* minor update with vars

Co-authored-by: Ita Zaporozhets <31893021+itazap@users.noreply.github.com>

* killed already defined var once more

Co-authored-by: Ita Zaporozhets <31893021+itazap@users.noreply.github.com>

---------

Co-authored-by: Ita Zaporozhets <31893021+itazap@users.noreply.github.com>
---
 src/transformers/tokenization_utils.py        |  8 ++-
 tests/tokenization/test_tokenization_utils.py | 65 +++++++++++++++++++
 2 files changed, 71 insertions(+), 2 deletions(-)

diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index 6a5bff3679f8..df13a029a6c6 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -1077,7 +1077,7 @@ def convert_tokens_to_string(self, tokens: List[str]) -> str:
 
     def _decode(
         self,
-        token_ids: List[int],
+        token_ids: Union[int, List[int]],
         skip_special_tokens: bool = False,
         clean_up_tokenization_spaces: bool = None,
         spaces_between_special_tokens: bool = True,
@@ -1086,6 +1086,10 @@ def _decode(
         self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)
 
         filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
+        # If given is a single id, prevents splitting the string in upcoming loop
+        if isinstance(filtered_tokens, str):
+            filtered_tokens = [filtered_tokens]
+
         legacy_added_tokens = set(self._added_tokens_encoder.keys()) - set(self.all_special_tokens) | {
             token for token in self.additional_special_tokens if self.convert_tokens_to_ids(token) >= self.vocab_size
         }
@@ -1096,7 +1100,7 @@ def _decode(
         current_sub_text = []
         # TODO @ArthurZ in version 5, special tokens should be handled in convert_tokens_to_string, while _convert_tokens_to_string
         for token in filtered_tokens:
-            if skip_special_tokens and token in self.all_special_ids:
+            if skip_special_tokens and token in self.all_special_tokens:
                 continue
             if token in legacy_added_tokens:
                 if current_sub_text:
diff --git a/tests/tokenization/test_tokenization_utils.py b/tests/tokenization/test_tokenization_utils.py
index b43923df84d7..2c8f71ba9772 100644
--- a/tests/tokenization/test_tokenization_utils.py
+++ b/tests/tokenization/test_tokenization_utils.py
@@ -253,6 +253,71 @@ def test_padding_accepts_tensors(self):
         self.assertTrue(isinstance(batch["input_ids"], np.ndarray))
         self.assertEqual(batch["input_ids"].tolist(), [[0, 1, 2, tokenizer.pad_token_id], [0, 1, 2, 3]])
 
+    @require_tokenizers
+    def test_decoding_single_token(self):
+        for tokenizer_class in [BertTokenizer, BertTokenizerFast]:
+            with self.subTest(f"{tokenizer_class}"):
+                tokenizer = tokenizer_class.from_pretrained("google-bert/bert-base-cased")
+
+                token_id = 2300
+                decoded_flat = tokenizer.decode(token_id)
+                decoded_list = tokenizer.decode([token_id])
+
+                self.assertEqual(decoded_flat, "Force")
+                self.assertEqual(decoded_list, "Force")
+
+                token_id = 0
+                decoded_flat = tokenizer.decode(token_id)
+                decoded_list = tokenizer.decode([token_id])
+
+                self.assertEqual(decoded_flat, "[PAD]")
+                self.assertEqual(decoded_list, "[PAD]")
+
+                last_item_id = tokenizer.vocab_size - 1
+                decoded_flat = tokenizer.decode(last_item_id)
+                decoded_list = tokenizer.decode([last_item_id])
+
+                self.assertEqual(decoded_flat, "##：")
+                self.assertEqual(decoded_list, "##：")
+
+    @require_tokenizers
+    def test_decoding_skip_special_tokens(self):
+        for tokenizer_class in [BertTokenizer, BertTokenizerFast]:
+            with self.subTest(f"{tokenizer_class}"):
+                tokenizer = tokenizer_class.from_pretrained("google-bert/bert-base-cased")
+                tokenizer.add_tokens(["ஐ"], special_tokens=True)
+
+                # test special token with other tokens, skip the special tokens
+                sentence = "This is a beautiful flower ஐ"
+                ids = tokenizer(sentence)["input_ids"]
+                decoded_sent = tokenizer.decode(ids, skip_special_tokens=True)
+                self.assertEqual(decoded_sent, "This is a beautiful flower")
+
+                # test special token with other tokens, do not skip the special tokens
+                ids = tokenizer(sentence)["input_ids"]
+                decoded_sent = tokenizer.decode(ids, skip_special_tokens=False)
+                self.assertEqual(decoded_sent, "[CLS] This is a beautiful flower ஐ [SEP]")
+
+                # test special token stand alone, skip the special tokens
+                sentence = "ஐ"
+                ids = tokenizer(sentence)["input_ids"]
+                decoded_sent = tokenizer.decode(ids, skip_special_tokens=True)
+                self.assertEqual(decoded_sent, "")
+
+                # test special token stand alone, do not skip the special tokens
+                ids = tokenizer(sentence)["input_ids"]
+                decoded_sent = tokenizer.decode(ids, skip_special_tokens=False)
+                self.assertEqual(decoded_sent, "[CLS] ஐ [SEP]")
+
+                # test single special token alone, skip
+                pad_id = 0
+                decoded_sent = tokenizer.decode(pad_id, skip_special_tokens=True)
+                self.assertEqual(decoded_sent, "")
+
+                # test single special token alone, do not skip
+                decoded_sent = tokenizer.decode(pad_id, skip_special_tokens=False)
+                self.assertEqual(decoded_sent, "[PAD]")
+
     @require_torch
     def test_padding_accepts_tensors_pt(self):
         import torch

From db72894b48990860a9b8d1ebbfdd9a0342309b8f Mon Sep 17 00:00:00 2001
From: Raushan Turganbay <raushan@huggingface.co>
Date: Wed, 18 Sep 2024 13:00:44 +0200
Subject: [PATCH 34/67] Chat template: save and load correctly for processors
 (#33462)

* fix

* add tests

* fix tests

* Update tests/models/llava/test_processor_llava.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* fix

* fix tests

* update tests

---------

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
---
 src/transformers/processing_utils.py          |  9 ++--
 tests/models/llava/test_processor_llava.py    | 29 +++++++++--
 .../llava_next/test_processor_llava_next.py   | 49 ++++++++++++++++++-
 .../test_processing_llava_onevision.py        | 29 ++++++++++-
 4 files changed, 106 insertions(+), 10 deletions(-)

diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py
index ddf40c4fe6d4..8b33c456924d 100644
--- a/src/transformers/processing_utils.py
+++ b/src/transformers/processing_utils.py
@@ -502,9 +502,12 @@ def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs):
         output_chat_template_file = os.path.join(save_directory, CHAT_TEMPLATE_NAME)
 
         processor_dict = self.to_dict()
-        chat_template = processor_dict.pop("chat_template", None)
-        if chat_template is not None:
-            chat_template_json_string = json.dumps({"chat_template": chat_template}, indent=2, sort_keys=True) + "\n"
+        # Save `chat_template` in its own file. We can't get it from `processor_dict` as we popped it in `to_dict`
+        # to avoid serializing chat template in json config file. So let's get it from `self` directly
+        if self.chat_template is not None:
+            chat_template_json_string = (
+                json.dumps({"chat_template": self.chat_template}, indent=2, sort_keys=True) + "\n"
+            )
             with open(output_chat_template_file, "w", encoding="utf-8") as writer:
                 writer.write(chat_template_json_string)
             logger.info(f"chat template saved in {output_chat_template_file}")
diff --git a/tests/models/llava/test_processor_llava.py b/tests/models/llava/test_processor_llava.py
index 5b05a8b92ea5..e62769e34509 100644
--- a/tests/models/llava/test_processor_llava.py
+++ b/tests/models/llava/test_processor_llava.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import json
 import shutil
 import tempfile
 import unittest
@@ -32,11 +33,11 @@ class LlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
 
     def setUp(self):
         self.tmpdirname = tempfile.mkdtemp()
+
         image_processor = CLIPImageProcessor(do_center_crop=False)
         tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b")
-
-        processor = LlavaProcessor(image_processor=image_processor, tokenizer=tokenizer)
-
+        processor_kwargs = self.prepare_processor_dict()
+        processor = LlavaProcessor(image_processor, tokenizer, **processor_kwargs)
         processor.save_pretrained(self.tmpdirname)
 
     def get_tokenizer(self, **kwargs):
@@ -48,6 +49,28 @@ def get_image_processor(self, **kwargs):
     def tearDown(self):
         shutil.rmtree(self.tmpdirname)
 
+    def prepare_processor_dict(self):
+        return {"chat_template": "dummy_template"}
+
+    @unittest.skip(
+        "Skip because the model has no processor kwargs except for chat template and"
+        "chat template is saved as a separate file. Stop skipping this test when the processor"
+        "has new kwargs saved in config file."
+    )
+    def test_processor_to_json_string(self):
+        pass
+
+    def test_chat_template_is_saved(self):
+        processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
+        processor_dict_loaded = json.loads(processor_loaded.to_json_string())
+        # chat templates aren't serialized to json in processors
+        self.assertFalse("chat_template" in processor_dict_loaded.keys())
+
+        # they have to be saved as separate file and loaded back from that file
+        # so we check if the same template is loaded
+        processor_dict = self.prepare_processor_dict()
+        self.assertTrue(processor_loaded.chat_template == processor_dict.get("chat_template", None))
+
     def test_can_load_various_tokenizers(self):
         for checkpoint in ["Intel/llava-gemma-2b", "llava-hf/llava-1.5-7b-hf"]:
             processor = LlavaProcessor.from_pretrained(checkpoint)
diff --git a/tests/models/llava_next/test_processor_llava_next.py b/tests/models/llava_next/test_processor_llava_next.py
index c8b58ce7982f..450034f4151d 100644
--- a/tests/models/llava_next/test_processor_llava_next.py
+++ b/tests/models/llava_next/test_processor_llava_next.py
@@ -11,20 +11,65 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import json
+import tempfile
 import unittest
 
 import torch
 
+from transformers import AutoProcessor, LlamaTokenizerFast, LlavaNextProcessor
 from transformers.testing_utils import require_vision
 from transformers.utils import is_vision_available
 
+from ...test_processing_common import ProcessorTesterMixin
+
 
 if is_vision_available():
-    from transformers import AutoProcessor
+    from transformers import CLIPImageProcessor
 
 
 @require_vision
-class LlavaProcessorTest(unittest.TestCase):
+class LlavaNextProcessorTest(ProcessorTesterMixin, unittest.TestCase):
+    processor_class = LlavaNextProcessor
+
+    def setUp(self):
+        self.tmpdirname = tempfile.mkdtemp()
+
+        image_processor = CLIPImageProcessor()
+        tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b")
+        processor_kwargs = self.prepare_processor_dict()
+        processor = LlavaNextProcessor(image_processor, tokenizer, **processor_kwargs)
+        processor.save_pretrained(self.tmpdirname)
+
+    def get_tokenizer(self, **kwargs):
+        return LlavaNextProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
+
+    def get_image_processor(self, **kwargs):
+        return LlavaNextProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
+
+    def prepare_processor_dict(self):
+        return {"chat_template": "dummy_template"}
+
+    @unittest.skip(
+        "Skip because the model has no processor kwargs except for chat template and"
+        "chat template is saved as a separate file. Stop skipping this test when the processor"
+        "has new kwargs saved in config file."
+    )
+    def test_processor_to_json_string(self):
+        pass
+
+    # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_chat_template_is_saved
+    def test_chat_template_is_saved(self):
+        processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
+        processor_dict_loaded = json.loads(processor_loaded.to_json_string())
+        # chat templates aren't serialized to json in processors
+        self.assertFalse("chat_template" in processor_dict_loaded.keys())
+
+        # they have to be saved as separate file and loaded back from that file
+        # so we check if the same template is loaded
+        processor_dict = self.prepare_processor_dict()
+        self.assertTrue(processor_loaded.chat_template == processor_dict.get("chat_template", None))
+
     def test_chat_template(self):
         processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-vicuna-7b-hf")
         expected_prompt = "USER: <image>\nWhat is shown in this image? ASSISTANT:"
diff --git a/tests/models/llava_onevision/test_processing_llava_onevision.py b/tests/models/llava_onevision/test_processing_llava_onevision.py
index e045f2ba7f0b..f747c18250b6 100644
--- a/tests/models/llava_onevision/test_processing_llava_onevision.py
+++ b/tests/models/llava_onevision/test_processing_llava_onevision.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import json
 import shutil
 import tempfile
 import unittest
@@ -40,9 +41,10 @@ def setUp(self):
         image_processor = LlavaOnevisionImageProcessor()
         video_processor = LlavaOnevisionVideoProcessor()
         tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
+        processor_kwargs = self.prepare_processor_dict()
 
         processor = LlavaOnevisionProcessor(
-            video_processor=video_processor, image_processor=image_processor, tokenizer=tokenizer
+            video_processor=video_processor, image_processor=image_processor, tokenizer=tokenizer, **processor_kwargs
         )
         processor.save_pretrained(self.tmpdirname)
 
@@ -52,9 +54,32 @@ def get_tokenizer(self, **kwargs):
     def get_image_processor(self, **kwargs):
         return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
 
-    def get_Video_processor(self, **kwargs):
+    def get_video_processor(self, **kwargs):
         return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).video_processor
 
+    def prepare_processor_dict(self):
+        return {"chat_template": "dummy_template"}
+
+    @unittest.skip(
+        "Skip because the model has no processor kwargs except for chat template and"
+        "chat template is saved as a separate file. Stop skipping this test when the processor"
+        "has new kwargs saved in config file."
+    )
+    def test_processor_to_json_string(self):
+        pass
+
+    # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_chat_template_is_saved
+    def test_chat_template_is_saved(self):
+        processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
+        processor_dict_loaded = json.loads(processor_loaded.to_json_string())
+        # chat templates aren't serialized to json in processors
+        self.assertFalse("chat_template" in processor_dict_loaded.keys())
+
+        # they have to be saved as separate file and loaded back from that file
+        # so we check if the same template is loaded
+        processor_dict = self.prepare_processor_dict()
+        self.assertTrue(processor_loaded.chat_template == processor_dict.get("chat_template", None))
+
     def tearDown(self):
         shutil.rmtree(self.tmpdirname)
 

From 9f2b8cc45afa67fd3962d777fc02061531159b9e Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Wed, 18 Sep 2024 21:46:12 +0800
Subject: [PATCH 35/67] Fix missing head_dim in llama config from gguf model
 (#33526)

fix missing head_dim in llama config from gguf
---
 src/transformers/integrations/ggml.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py
index b5471574a13d..89d4b29de774 100644
--- a/src/transformers/integrations/ggml.py
+++ b/src/transformers/integrations/ggml.py
@@ -120,7 +120,8 @@
         "block_count": "num_hidden_layers",
         "feed_forward_length": "intermediate_size",
         "embedding_length": "hidden_size",
-        "rope.dimension_count": None,
+        # NOTE: rope.dimension_count==head_dim only suitable for llama/mistral
+        "rope.dimension_count": "head_dim",
         "rope.freq_base": "rope_theta",
         "attention.head_count": "num_attention_heads",
         "attention.head_count_kv": "num_key_value_heads",
@@ -132,7 +133,8 @@
         "block_count": "num_hidden_layers",
         "feed_forward_length": "intermediate_size",
         "embedding_length": "hidden_size",
-        "rope.dimension_count": None,
+        # NOTE: rope.dimension_count==head_dim only suitable for llama/mistral
+        "rope.dimension_count": "head_dim",
         "rope.freq_base": "rope_theta",
         "attention.head_count": "num_attention_heads",
         "attention.head_count_kv": "num_key_value_heads",

From 5427eaad4321e40ba3a8697f91f4133c73e08614 Mon Sep 17 00:00:00 2001
From: Ikram Ali <mrikram1989@gmail.com>
Date: Wed, 18 Sep 2024 18:49:19 +0500
Subject: [PATCH 36/67] [i18n-ur] Added README_ur.md file (#33461)

* Urdu docs added

* fixed the misaligned issue.
---
 README.md              |   3 +-
 i18n/README_ar.md      |   1 +
 i18n/README_de.md      |   1 +
 i18n/README_es.md      |   1 +
 i18n/README_fr.md      |   1 +
 i18n/README_hd.md      |   1 +
 i18n/README_ja.md      |   1 +
 i18n/README_ko.md      |   1 +
 i18n/README_pt-br.md   |   1 +
 i18n/README_ru.md      |   1 +
 i18n/README_te.md      |   1 +
 i18n/README_ur.md      | 333 +++++++++++++++++++++++++++++++++++++++++
 i18n/README_vi.md      |   1 +
 i18n/README_zh-hans.md |   1 +
 i18n/README_zh-hant.md |   1 +
 15 files changed, 348 insertions(+), 1 deletion(-)
 create mode 100644 i18n/README_ur.md

diff --git a/README.md b/README.md
index f80835534496..a2325ae03762 100644
--- a/README.md
+++ b/README.md
@@ -48,7 +48,8 @@ limitations under the License.
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_fr.md">Français</a> |
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_vi.md">Tiếng Việt</a> |
-	<a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md">العربية</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md">العربية</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ur.md">اردو</a> |
     </p>
 </h4>
 
diff --git a/i18n/README_ar.md b/i18n/README_ar.md
index 60ec4e1c0689..c2dd588fdb23 100644
--- a/i18n/README_ar.md
+++ b/i18n/README_ar.md
@@ -49,6 +49,7 @@ limitations under the License.
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_vi.md">Tiếng Việt</a> |
 		<b>العربية</b> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ur.md">اردو</a> |
     </p>
 </h4>
 
diff --git a/i18n/README_de.md b/i18n/README_de.md
index 7128a9ad999f..2532c9e12fab 100644
--- a/i18n/README_de.md
+++ b/i18n/README_de.md
@@ -49,6 +49,7 @@ limitations under the License.
         <b>Deutsch</b> |
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_vi.md">Tiếng Việt</a> |
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md">العربية</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ur.md">اردو</a> |
     </p>
 </h4>
 
diff --git a/i18n/README_es.md b/i18n/README_es.md
index fdef68ff1b4c..6682147d7867 100644
--- a/i18n/README_es.md
+++ b/i18n/README_es.md
@@ -44,6 +44,7 @@ limitations under the License.
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_vi.md">Tiếng Việt</a> |
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md">العربية</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ur.md">اردو</a> |
     </p>
 </h4>
 
diff --git a/i18n/README_fr.md b/i18n/README_fr.md
index 2c78ba041db2..c1eaa10edb92 100644
--- a/i18n/README_fr.md
+++ b/i18n/README_fr.md
@@ -49,6 +49,7 @@ limitations under the License.
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_vi.md">Tiếng Việt</a> |
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md">العربية</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ur.md">اردو</a> |
     </p>
 </h4>
 
diff --git a/i18n/README_hd.md b/i18n/README_hd.md
index a6e017a6f833..07077e5dd9c3 100644
--- a/i18n/README_hd.md
+++ b/i18n/README_hd.md
@@ -69,6 +69,7 @@ checkpoint: जाँच बिंदु
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_vi.md">Tiếng Việt</a> |
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md">العربية</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ur.md">اردو</a> |
     </p>
 </h4>
 
diff --git a/i18n/README_ja.md b/i18n/README_ja.md
index 27b770869f71..293a5ee111b0 100644
--- a/i18n/README_ja.md
+++ b/i18n/README_ja.md
@@ -79,6 +79,7 @@ user: ユーザ
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_vi.md">Tiếng Việt</a> |
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md">العربية</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ur.md">اردو</a> |
     </p>
 </h4>
 
diff --git a/i18n/README_ko.md b/i18n/README_ko.md
index 283318478f4b..e2a9b80d0d3e 100644
--- a/i18n/README_ko.md
+++ b/i18n/README_ko.md
@@ -44,6 +44,7 @@ limitations under the License.
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_vi.md">Tiếng Việt</a> |
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md">العربية</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ur.md">اردو</a> |
 
     </p>
 </h4>
diff --git a/i18n/README_pt-br.md b/i18n/README_pt-br.md
index a356caefba9b..79007e5aaa33 100644
--- a/i18n/README_pt-br.md
+++ b/i18n/README_pt-br.md
@@ -49,6 +49,7 @@ limitations under the License.
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_vi.md">Tiếng Việt</a> |
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md">العربية</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ur.md">اردو</a> |
     </p>
 </h4>
 
diff --git a/i18n/README_ru.md b/i18n/README_ru.md
index fe548c100114..759acdbb9127 100644
--- a/i18n/README_ru.md
+++ b/i18n/README_ru.md
@@ -49,6 +49,7 @@ limitations under the License.
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_vi.md">Tiếng Việt</a> |
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md">العربية</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ur.md">اردو</a> |
     <p>
 </h4>
 
diff --git a/i18n/README_te.md b/i18n/README_te.md
index 9dbd522c463d..feb537ad1a48 100644
--- a/i18n/README_te.md
+++ b/i18n/README_te.md
@@ -51,6 +51,7 @@ limitations under the License.
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_vi.md">Tiếng Việt</a> |
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md">العربية</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ur.md">اردو</a> |
     </p>
 </h4>
 
diff --git a/i18n/README_ur.md b/i18n/README_ur.md
new file mode 100644
index 000000000000..e14c87077707
--- /dev/null
+++ b/i18n/README_ur.md
@@ -0,0 +1,333 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<p align="center">
+  <picture>
+    <source media="(prefers-color-scheme: dark)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-dark.svg">
+    <source media="(prefers-color-scheme: light)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg">
+    <img alt="Hugging Face Transformers Library" src="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg" width="352" height="59" style="max-width: 100%;">
+  </picture>
+  <br/>
+  <br/>
+</p>
+
+<p align="center">
+    <a href="https://circleci.com/gh/huggingface/transformers"><img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main"></a>
+    <a href="https://github.com/huggingface/transformers/blob/main/LICENSE"><img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue"></a>
+    <a href="https://huggingface.co/docs/transformers/index"><img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/docs/transformers/index.svg?down_color=red&down_message=offline&up_message=online"></a>
+    <a href="https://github.com/huggingface/transformers/releases"><img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/transformers.svg"></a>
+    <a href="https://github.com/huggingface/transformers/blob/main/CODE_OF_CONDUCT.md"><img alt="Contributor Covenant" src="https://img.shields.io/badge/Contributor%20Covenant-v2.0%20adopted-ff69b4.svg"></a>
+    <a href="https://zenodo.org/badge/latestdoi/155220641"><img src="https://zenodo.org/badge/155220641.svg" alt="DOI"></a>
+</p>
+
+<h4 align="center">
+    <p>
+        <a href="https://github.com/huggingface/transformers/blob/main/README.md">English</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_zh-hans.md">简体中文</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_zh-hant.md">繁體中文</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ko.md">한국어</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_es.md">Español</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ja.md">日本語</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_hd.md">हिन्दी</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ru.md">Русский</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_pt-br.md">Рortuguês</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_te.md">తెలుగు</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_fr.md">Français</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_vi.md">Tiếng Việt</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md">العربية</a> |
+        <b>اردو</b> |
+    </p>
+</h4>
+
+<h3 align="center">
+    <p>جدید ترین مشین لرننگ برائے JAX، PyTorch اور TensorFlow</p>
+</h3>
+
+<h3 align="center">
+    <a href="https://hf.co/course"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/course_banner.png"></a>
+</h3>
+
+&#8207;🤗 Transformers مختلف طریقوں جیسے کہ متن، بصارت، اور آڈیو پر کام کرنے کے لیے ہزاروں پری ٹرینڈ ماڈلز فراہم کرتے ہیں۔
+
+یہ ماڈلز درج ذیل پر لاگو کیے جا سکتے ہیں:
+
+* 📝 متن، جیسے کہ متن کی درجہ بندی، معلومات کا استخراج، سوالات کے جوابات، خلاصہ، ترجمہ، اور متن کی تخلیق، 100 سے زائد زبانوں میں۔
+* 🖼️ تصاویر، جیسے کہ تصویر کی درجہ بندی، اشیاء کی شناخت، اور تقسیم۔
+* 🗣️ آڈیو، جیسے کہ تقریر کی شناخت اور آڈیو کی درجہ بندی۔
+
+ٹرانسفارمر ماڈلز **مختلف طریقوں کو ملا کر** بھی کام انجام دے سکتے ہیں، جیسے کہ ٹیبل سوال جواب، بصری حروف کی شناخت، اسکین شدہ دستاویزات سے معلومات نکالنا، ویڈیو کی درجہ بندی، اور بصری سوال جواب۔
+
+&#8207;🤗 Transformers ایسے APIs فراہم کرتا ہے جو آپ کو تیز رفتاری سے پری ٹرینڈ ماڈلز کو ایک دیے گئے متن پر ڈاؤن لوڈ اور استعمال کرنے، انہیں اپنے ڈیٹا سیٹس پر فائن ٹون کرنے، اور پھر ہمارے [ماڈل حب](https://huggingface.co/models) پر کمیونٹی کے ساتھ شیئر کرنے کی سہولت دیتا ہے۔ اسی وقت، ہر پائتھن ماڈیول جو ایک آرکیٹیکچر کو بیان کرتا ہے، مکمل طور پر خود مختار ہوتا ہے اور اسے تیز تحقیقاتی تجربات کے لیے تبدیل کیا جا سکتا ہے۔
+
+
+&#8207;🤗 Transformers تین سب سے مشہور ڈیپ لرننگ لائبریریوں — [Jax](https://jax.readthedocs.io/en/latest/)، [PyTorch](https://pytorch.org/) اور [TensorFlow](https://www.tensorflow.org/) — کی مدد سے تیار کردہ ہے، جن کے درمیان بے حد ہموار انضمام ہے۔ اپنے ماڈلز کو ایک کے ساتھ تربیت دینا اور پھر دوسرے کے ساتھ inference کے لیے لوڈ کرنا انتہائی سادہ ہے۔
+
+## آن لائن ڈیمو
+
+آپ ہمارے زیادہ تر ماڈلز کو براہ راست ان کے صفحات پر [ماڈل ہب](https://huggingface.co/models) سے آزما سکتے ہیں۔ ہم عوامی اور نجی ماڈلز کے لیے [ذاتی ماڈل ہوسٹنگ، ورژننگ، اور انفرنس API](https://huggingface.co/pricing) بھی فراہم کرتے ہیں۔
+
+یہاں چند مثالیں ہیں:
+
+قدرتی زبان کی پروسیسنگ میں:
+
+- [&#8207;BERT کے ساتھ ماسک شدہ الفاظ کی تکمیل](https://huggingface.co/google-bert/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
+- [&#8207;Electra کے ساتھ نامزد اداروں کی شناخت](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
+- [&#8207;Mistral کے ساتھ متنی جنریشن](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)
+- [&#8207;RoBERTa کے ساتھ قدرتی زبان کی دلیل](https://huggingface.co/FacebookAI/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
+- [&#8207;BART کے ساتھ خلاصہ کاری](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
+- [&#8207;DistilBERT کے ساتھ سوالات کے جوابات](https://huggingface.co/distilbert/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
+- [&#8207;T5 کے ساتھ ترجمہ](https://huggingface.co/google-t5/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)
+  
+کمپیوٹر وژن میں:
+- [&#8207;ViT کے ساتھ امیج کی درجہ بندی](https://huggingface.co/google/vit-base-patch16-224)
+- [&#8207;DETR کے ساتھ اشیاء کی شناخت](https://huggingface.co/facebook/detr-resnet-50)
+- [&#8207;SegFormer کے ساتھ سیمانٹک سیگمینٹیشن](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512)
+- [&#8207;Mask2Former کے ساتھ پینوسٹک سیگمینٹیشن](https://huggingface.co/facebook/mask2former-swin-large-coco-panoptic)
+- [&#8207;Depth Anything کے ساتھ گہرائی کا اندازہ](https://huggingface.co/docs/transformers/main/model_doc/depth_anything)
+- [&#8207;VideoMAE کے ساتھ ویڈیو کی درجہ بندی](https://huggingface.co/docs/transformers/model_doc/videomae)
+- [&#8207;OneFormer کے ساتھ یونیورسل سیگمینٹیشن](https://huggingface.co/shi-labs/oneformer_ade20k_dinat_large)
+
+
+آڈیو:
+- [خودکار تقریر کی پہچان Whisper کے ساتھ](https://huggingface.co/openai/whisper-large-v3)
+- [کلیدی الفاظ کی تلاش Wav2Vec2 کے ساتھ](https://huggingface.co/superb/wav2vec2-base-superb-ks)
+- [آڈیو کی درجہ بندی Audio Spectrogram Transformer کے ساتھ](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593)
+
+ملٹی ماڈل ٹاسک میں:
+
+- [ٹیبل سوال جواب کے لیے TAPAS](https://huggingface.co/google/tapas-base-finetuned-wtq)
+- [ویژول سوال جواب کے لیے ViLT](https://huggingface.co/dandelin/vilt-b32-finetuned-vqa)
+- [امیج کیپشننگ کے لیے LLaVa](https://huggingface.co/llava-hf/llava-1.5-7b-hf)
+- [زیرو شاٹ امیج کلاسیفیکیشن کے لیے SigLIP](https://huggingface.co/google/siglip-so400m-patch14-384)
+- [دستاویزی سوال جواب کے لیے LayoutLM](https://huggingface.co/impira/layoutlm-document-qa)
+- [زیرو شاٹ ویڈیو کلاسیفیکیشن کے لیے X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)
+- [زیرو شاٹ آبجیکٹ ڈیٹیکشن کے لیے OWLv2](https://huggingface.co/docs/transformers/en/model_doc/owlv2)
+- [زیرو شاٹ امیج سیگمنٹیشن کے لیے CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)
+- [خودکار ماسک جنریشن کے لیے SAM](https://huggingface.co/docs/transformers/model_doc/sam)
+
+
+## ٹرانسفارمرز کے 100 منصوبے
+
+&#8207;🤗 Transformers صرف پیشگی تربیت یافتہ ماڈلز کا ایک ٹول کٹ نہیں ہے: یہ ایک کمیونٹی ہے جو اس کے ارد گرد اور ہیگنگ فیس حب پر تعمیر شدہ منصوبوں کا مجموعہ ہے۔ ہم    چاہتے ہیں کہ🤗 Transformers ترقی کاروں، محققین، طلباء، پروفیسرز، انجینئرز، اور ہر کسی کو اپنے خوابوں کے منصوبے بنانے میں مدد فراہم کرے۔
+
+
+&#8207;🤗 Transformers کے 100,000 ستاروں کی خوشی منانے کے لیے، ہم نے کمیونٹی پر روشنی ڈالنے کا فیصلہ کیا ہے، اور ہم نے [awesome-transformers](./awesome-transformers.md) کا صفحہ بنایا ہے جو 100 شاندار منصوبے درج کرتا ہے جو 🤗 Transformers کے ارد گرد بنائے گئے ہیں۔
+
+اگر آپ کے پاس کوئی ایسا منصوبہ ہے جسے آپ سمجھتے ہیں کہ اس فہرست کا حصہ ہونا چاہیے، تو براہ کرم ایک PR کھولیں تاکہ اسے شامل کیا جا سکے!
+
+## اگر آپ ہیگنگ فیس ٹیم سے حسب ضرورت معاونت تلاش کر رہے ہیں
+
+<a target="_blank" href="https://huggingface.co/support">
+    <img alt="HuggingFace Expert Acceleration Program" src="https://cdn-media.huggingface.co/marketing/transformers/new-support-improved.png" style="max-width: 600px; border: 1px solid #eee; border-radius: 4px; box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);">
+</a><br>
+
+## فوری ٹور
+
+دیے گئے ان پٹ (متن، تصویر، آڈیو، ...) پر ماڈل کو فوری طور پر استعمال کرنے کے لیے، ہم pipeline API فراہم کرتے ہیں۔ پائپ لائنز ایک پیشگی تربیت یافتہ ماڈل کو اس ماڈل کی تربیت کے دوران استعمال ہونے والے پری پروسیسنگ کے ساتھ گروپ کرتی ہیں۔ یہاں یہ ہے کہ مثبت اور منفی متون کی درجہ بندی کے لیے پائپ لائن کو جلدی سے کیسے استعمال کیا جائے:
+
+
+```python
+>>> from transformers import pipeline
+
+# جذبات کے تجزیے کے لیے ایک پائپ لائن مختص کریں
+>>> classifier = pipeline('sentiment-analysis')
+>>> classifier('We are very happy to introduce pipeline to the transformers repository.')
+[{'label': 'POSITIVE', 'score': 0.9996980428695679}]
+```
+
+دوسری لائن کوڈ پائپ لائن کے ذریعہ استعمال ہونے والے پیشگی تربیت یافتہ ماڈل کو ڈاؤن لوڈ اور کیش کرتی ہے، جبکہ تیسری لائن اسے دیے گئے متن پر جانچتی ہے۔ یہاں، جواب "مثبت" ہے جس کی اعتماد کی شرح 99.97% ہے۔
+
+بہت سے کاموں کے لیے ایک پیشگی تربیت یافتہ pipeline تیار ہے، NLP کے علاوہ کمپیوٹر ویژن اور آواز میں بھی۔ مثال کے طور پر، ہم تصویر میں دریافت شدہ اشیاء کو آسانی سے نکال سکتے ہیں:
+
+
+``` python
+>>> import requests
+>>> from PIL import Image
+>>> from transformers import pipeline
+
+# جذبات کے تجزیے کے لیے ایک پائپ لائن مختص کریں
+>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png"
+>>> image_data = requests.get(url, stream=True).raw
+>>> image = Image.open(image_data)
+
+>>> object_detector = pipeline('object-detection')
+>>> object_detector(image)
+[{'score': 0.9982201457023621،
+  'label': 'remote'،
+  'box': {'xmin': 40, 'ymin': 70, 'xmax': 175, 'ymax': 117}}،
+ {'score': 0.9960021376609802،
+  'label': 'remote'،
+  'box': {'xmin': 333, 'ymin': 72, 'xmax': 368, 'ymax': 187}}،
+ {'score': 0.9954745173454285،
+  'label': 'couch'،
+  'box': {'xmin': 0, 'ymin': 1, 'xmax': 639, 'ymax': 473}}،
+ {'score': 0.9988006353378296،
+  'label': 'cat'،
+  'box': {'xmin': 13, 'ymin': 52, 'xmax': 314, 'ymax': 470}}،
+ {'score': 0.9986783862113953،
+  'label': 'cat'،
+  'box': {'xmin': 345, 'ymin': 23, 'xmax': 640, 'ymax': 368}}]
+```
+
+یہاں، ہم کو تصویر میں دریافت شدہ اشیاء کی فہرست ملتی ہے، ہر ایک کے گرد ایک باکس اور اعتماد کا اسکور۔ یہاں اصل تصویر بائیں طرف ہے، اور پیشگوئیاں دائیں طرف ظاہر کی گئی ہیں:
+
+
+<h3 align="center">
+    <a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png" width="400"></a>
+    <a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample_post_processed.png" width="400"></a>
+</h3>
+
+آپ `pipeline` API کی مدد سے معاونت شدہ کاموں کے بارے میں مزید جان سکتے ہیں [اس ٹیوٹوریل](https://huggingface.co/docs/transformers/task_summary) میں۔
+
+
+&#8207;`pipeline` کے علاوہ، کسی بھی پیشگی تربیت یافتہ ماڈل کو آپ کے دیے گئے کام پر ڈاؤن لوڈ اور استعمال کرنے کے لیے، صرف تین لائنوں کا کوڈ کافی ہے۔ یہاں PyTorch ورژن ہے:
+
+```python
+>>> from transformers import AutoTokenizer، AutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+>>> model = AutoModel.from_pretrained("google-bert/bert-base-uncased")
+
+>>> inputs = tokenizer("Hello world!"، return_tensors="pt")
+>>> outputs = model(**inputs)
+```
+
+اور یہاں TensorFlow کے لیے مساوی کوڈ ہے:
+```python
+>>> from transformers import AutoTokenizer، TFAutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+>>> model = TFAutoModel.from_pretrained("google-bert/bert-base-uncased")
+
+>>> inputs = tokenizer("Hello world!"، return_tensors="tf")
+>>> outputs = model(**inputs)
+```
+
+ٹوکینائزر تمام پری پروسیسنگ کا ذمہ دار ہے جس کی پیشگی تربیت یافتہ ماڈل کو ضرورت ہوتی ہے اور اسے براہ راست ایک واحد سٹرنگ (جیسا کہ اوپر کی مثالوں میں) یا ایک فہرست پر کال کیا جا سکتا ہے۔ یہ ایک لغت فراہم کرے گا جسے آپ ڈاؤن اسٹریم کوڈ میں استعمال کر سکتے ہیں یا سادہ طور پر اپنے ماڈل کو ** دلیل انپیکنگ آپریٹر کے ذریعے براہ راست پاس کر سکتے ہیں۔
+
+ماڈل خود ایک باقاعدہ [PyTorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) یا [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) (آپ کے بیک اینڈ پر منحصر ہے) ہے جسے آپ معمول کے مطابق استعمال کر سکتے ہیں۔ [یہ ٹیوٹوریل](https://huggingface.co/docs/transformers/training) وضاحت کرتا ہے کہ کلاسیکی PyTorch یا TensorFlow تربیتی لوپ میں ایسے ماڈل کو کیسے ضم کیا جائے، یا ہمارے `Trainer` API کا استعمال کرتے ہوئے نئے ڈیٹا سیٹ پر جلدی سے فائن ٹیون کیسے کیا جائے۔
+
+## مجھے Transformers کیوں استعمال کرنا چاہیے؟
+
+&#8207; 1. استعمال میں آسان جدید ترین ماڈلز:
+   
+ - قدرتی زبان کی سمجھ اور تخلیق، کمپیوٹر وژن، اور آڈیو کے کاموں میں اعلی کارکردگی۔
+ - معلمین اور عملی ماہرین کے لیے کم داخلی رکاوٹ۔
+ - سیکھنے کے لیے صرف تین کلاسز کے ساتھ چند یوزر فرینڈلی ایبسٹریکشنز۔
+ - ہمارے تمام pretrained ماڈلز کے استعمال کے لیے ایک متحد API۔
+
+&#8207; 2. کمپیوٹیشن کے اخراجات میں کمی، کاربن فٹ پرنٹ میں کمی:
+
+- محققین ہمیشہ دوبارہ تربیت کرنے کی بجائے تربیت شدہ ماڈلز شیئر کر سکتے ہیں۔
+- عملی ماہرین کمپیوٹ وقت اور پروڈکشن اخراجات کو کم کر سکتے ہیں۔
+- ہر موڈیلٹی کے لیے 400,000 سے زیادہ pretrained ماڈلز کے ساتھ درجنوں آرکیٹیکچرز۔
+
+&#8207; 3. ماڈل کے لائف ٹائم کے ہر حصے کے لیے صحیح 
+فریم ورک کا انتخاب کریں:
+
+  - 3 لائنز کے کوڈ میں جدید ترین ماڈلز تربیت دیں۔
+  - ایک ماڈل کو کسی بھی وقت TF2.0/PyTorch/JAX فریم ورکس کے درمیان منتقل کریں۔
+  - تربیت، تشخیص، اور پروڈکشن کے لیے بغیر کسی رکاوٹ کے صحیح فریم ورک کا انتخاب کریں۔
+
+&#8207; 4. اپنے ضروریات کے مطابق آسانی سے ماڈل یا ایک مثال کو حسب ضرورت بنائیں:
+
+  - ہم ہر آرکیٹیکچر کے لیے مثالیں فراہم کرتے ہیں تاکہ اصل مصنفین کے شائع شدہ نتائج کو دوبارہ پیدا کیا جا سکے۔
+  - ماڈلز کی اندرونی تفصیلات کو جتنا ممکن ہو یکساں طور پر ظاہر کیا جاتا ہے۔
+  - فوری تجربات کے لیے ماڈل فائلز کو لائبریری سے آزادانہ طور پر استعمال کیا جا سکتا ہے۔
+
+## مجھے Transformers کیوں استعمال نہیں کرنا چاہیے؟
+
+- یہ لائبریری نیورل نیٹس کے لیے بلڈنگ بلاکس کا ماڈیولر ٹول باکس نہیں ہے۔ ماڈل فائلز میں موجود کوڈ جان بوجھ کر اضافی ایبسٹریکشنز کے ساتھ دوبارہ ترتیب نہیں دیا گیا ہے، تاکہ محققین بغیر اضافی ایبسٹریکشنز/فائلوں میں گئے ہوئے جلدی سے ہر ماڈل پر کام کر سکیں۔
+- تربیتی API کا مقصد کسی بھی ماڈل پر کام کرنے کے لیے نہیں ہے بلکہ یہ لائبریری کے فراہم کردہ ماڈلز کے ساتھ کام کرنے کے لیے بہتر بنایا گیا ہے۔ عام مشین لرننگ لوپس کے لیے، آپ کو دوسری لائبریری (ممکنہ طور پر [Accelerate](https://huggingface.co/docs/accelerate)) استعمال کرنی چاہیے۔
+- حالانکہ ہم جتنا ممکن ہو زیادہ سے زیادہ استعمال کے کیسز پیش کرنے کی کوشش کرتے ہیں، ہمارے [مثالوں کے فولڈر](https://github.com/huggingface/transformers/tree/main/examples) میں موجود اسکرپٹس صرف یہی ہیں: مثالیں۔ یہ توقع کی جاتی ہے کہ یہ آپ کے مخصوص مسئلے پر فوراً کام نہیں کریں گی اور آپ کو اپنی ضروریات کے مطابق کوڈ کی کچھ لائنیں تبدیل کرنی پڑیں گی۔
+
+### انسٹالیشن
+
+#### &#8207; pip کے ساتھ
+
+یہ ریپوزٹری Python 3.8+، Flax 0.4.1+، PyTorch 1.11+، اور TensorFlow 2.6+ پر ٹیسٹ کی گئی ہے۔
+
+آپ کو 🤗 Transformers کو ایک [ورچوئل ماحول](https://docs.python.org/3/library/venv.html) میں انسٹال کرنا چاہیے۔ اگر آپ Python ورچوئل ماحول سے واقف نہیں ہیں، تو [یوزر گائیڈ](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) دیکھیں۔
+
+پہلے، Python کے اس ورژن کے ساتھ ایک ورچوئل ماحول بنائیں جو آپ استعمال کر رہے ہیں اور اسے ایکٹیویٹ کریں۔
+
+پھر، آپ کو کم از کم Flax، PyTorch، یا TensorFlow میں سے کسی ایک کو انسٹال کرنے کی ضرورت ہوگی۔
+براہ کرم اپنے پلیٹ فارم کے لیے مخصوص انسٹالیشن کمانڈ کے حوالے سے [TensorFlow انسٹالیشن صفحہ](https://www.tensorflow.org/install/)، [PyTorch انسٹالیشن صفحہ](https://pytorch.org/get-started/locally/#start-locally) اور/یا [Flax](https://github.com/google/flax#quick-install) اور [Jax](https://github.com/google/jax#installation) انسٹالیشن صفحات دیکھیں۔
+
+جب ان میں سے کوئی ایک بیک اینڈ انسٹال ہو جائے، تو 🤗 Transformers کو pip کے ذریعے مندرجہ ذیل طریقے سے انسٹال کیا جا سکتا ہے:
+
+```bash
+pip install transformers
+```
+
+اگر آپ مثالوں کے ساتھ کھیلنا چاہتے ہیں یا آپ کو کوڈ کا تازہ ترین ورژن چاہیے اور آپ نئے ریلیز کا انتظار نہیں کر سکتے، تو آپ کو [سورس سے لائبریری انسٹال کرنی ہوگی](https://huggingface.co/docs/transformers/installation#installing-from-source)۔
+
+#### &#8207;conda کے ساتھ
+
+&#8207;🤗 Transformers کو conda کے ذریعے مندرجہ ذیل طریقے سے انسٹال کیا جا سکتا ہے:
+
+```shell script
+conda install conda-forge::transformers
+```
+
+> **_نوٹ:_** `transformers` کو `huggingface` چینل سے انسٹال کرنا اب ختم کیا جا چکا ہے۔
+
+Flax، PyTorch، یا TensorFlow کو conda کے ساتھ انسٹال کرنے کے لیے انسٹالیشن صفحات کی پیروی کریں۔
+
+> **_نوٹ:_**  ونڈوز پر، آپ کو کیشنگ سے فائدہ اٹھانے کے لیے ڈویلپر موڈ کو ایکٹیویٹ کرنے کا پیغام دیا جا سکتا ہے۔ اگر یہ آپ کے لیے ممکن نہیں ہے، تو براہ کرم ہمیں [اس مسئلے](https://github.com/huggingface/huggingface_hub/issues/1062) میں بتائیں۔
+
+### ماڈل کی تعمیرات 
+
+&#8207; 🤗 Transformers کی طرف سے فراہم کردہ **[تمام ماڈل چیک پوائنٹس](https://huggingface.co/models)** ہگنگ فیس کے ماڈل حب [model hub](https://huggingface.co/models) سے بآسانی مربوط ہیں، جہاں یہ براہ راست [صارفین](https://huggingface.co/users) اور [تنظیموں](https://huggingface.co/organizations) کے ذریعہ اپ لوڈ کیے جاتے ہیں۔
+
+چیک پوائنٹس کی موجودہ تعداد: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
+
+&#8207;🤗 Transformers فی الحال درج ذیل معماریاں فراہم کرتا ہے: ہر ایک کا اعلی سطحی خلاصہ دیکھنے کے لیے [یہاں](https://huggingface.co/docs/transformers/model_summary) دیکھیں۔
+
+یہ چیک کرنے کے لیے کہ ہر ماڈل کی Flax، PyTorch یا TensorFlow میں کوئی عملداری ہے یا 🤗 Tokenizers لائبریری کے ذریعہ سپورٹ کردہ ٹوکنائزر کے ساتھ ہے، [اس جدول](https://huggingface.co/docs/transformers/index#supported-frameworks) کا حوالہ لیں۔
+
+یہ عملداری مختلف ڈیٹا سیٹس پر ٹیسٹ کی گئی ہیں (مثال کے اسکرپٹس دیکھیں) اور اصل عملداری کی کارکردگی کے ہم آہنگ ہونی چاہئیں۔ آپ کو کارکردگی کی مزید تفصیلات [دستاویزات](https://github.com/huggingface/transformers/tree/main/examples) کے مثالوں کے سیکشن میں مل سکتی ہیں۔
+
+
+## مزید معلومات حاصل کریں
+
+| سیکشن | تفصیل |
+|-|-|
+| [دستاویزات](https://huggingface.co/docs/transformers/) | مکمل API دستاویزات اور ٹیوٹوریلز |
+| [ٹاسک کا خلاصہ](https://huggingface.co/docs/transformers/task_summary) | 🤗 Transformers کے ذریعہ سپورٹ کردہ ٹاسک |
+| [پری پروسیسنگ ٹیوٹوریل](https://huggingface.co/docs/transformers/preprocessing) | ماڈلز کے لیے ڈیٹا تیار کرنے کے لیے `Tokenizer` کلاس کا استعمال |
+| [ٹریننگ اور فائن ٹیوننگ](https://huggingface.co/docs/transformers/training) | PyTorch/TensorFlow ٹریننگ لوپ میں 🤗 Transformers کی طرف سے فراہم کردہ ماڈلز کا استعمال اور `Trainer` API |
+| [تیز دورہ: فائن ٹیوننگ/استعمال کے اسکرپٹس](https://github.com/huggingface/transformers/tree/main/examples) | مختلف قسم کے ٹاسک پر ماڈلز کو فائن ٹیون کرنے کے لیے مثال کے اسکرپٹس |
+| [ماڈل کا اشتراک اور اپ لوڈ کرنا](https://huggingface.co/docs/transformers/model_sharing) | اپنی فائن ٹیون کردہ ماڈلز کو کمیونٹی کے ساتھ اپ لوڈ اور شیئر کریں |
+
+## استشہاد
+
+ہم نے اب ایک [تحقیقی مقالہ](https://www.aclweb.org/anthology/2020.emnlp-demos.6/) تیار کیا ہے جسے آپ 🤗 Transformers لائبریری کے لیے حوالہ دے سکتے ہیں:
+
+```bibtex
+@inproceedings{wolf-etal-2020-transformers،
+    title = "Transformers: State-of-the-Art Natural Language Processing"،
+    author = "Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and R{\'e}mi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush"،
+    booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations"،
+    month = oct،
+    year = "2020"،
+    address = "Online"،
+    publisher = "Association for Computational Linguistics"،
+    url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6"،
+    pages = "38--45"
+}
+```
diff --git a/i18n/README_vi.md b/i18n/README_vi.md
index f85dda3e215d..5e5c2ab1e25c 100644
--- a/i18n/README_vi.md
+++ b/i18n/README_vi.md
@@ -49,6 +49,7 @@ limitations under the License.
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
         <b>Tiếng việt</b> |
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md">العربية</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ur.md">اردو</a> |
     </p>
 </h4>
 
diff --git a/i18n/README_zh-hans.md b/i18n/README_zh-hans.md
index f857f50d1a55..61f3a19849ff 100644
--- a/i18n/README_zh-hans.md
+++ b/i18n/README_zh-hans.md
@@ -69,6 +69,7 @@ checkpoint: 检查点
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_vi.md">Tiếng Việt</a> |
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md">العربية</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ur.md">اردو</a> |
     </p>
 </h4>
 
diff --git a/i18n/README_zh-hant.md b/i18n/README_zh-hant.md
index 721e6575dec7..e20798a2d457 100644
--- a/i18n/README_zh-hant.md
+++ b/i18n/README_zh-hant.md
@@ -81,6 +81,7 @@ user: 使用者
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_vi.md">Tiếng Việt</a> |
         <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md">العربية</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ur.md">اردو</a> |
     </p>
 </h4>
 

From 4f1e9bae4e0a2d94e8a347964569dd1df385de55 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ziy=C3=BA=20Ye?= <ziyuye@uchicago.edu>
Date: Wed, 18 Sep 2024 07:23:05 -0700
Subject: [PATCH 37/67] fix the wandb logging issue (#33464)

* fix the wandb logging issue

* handle ConfigError in WandbCallback; move import to local scope

* update integration_utils.py; move import of ConfigError

* Update integration_utils.py: remove trailing whitespace
---
 src/transformers/integrations/integration_utils.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py
index 9172f9599f77..40298f9c6fc7 100755
--- a/src/transformers/integrations/integration_utils.py
+++ b/src/transformers/integrations/integration_utils.py
@@ -803,6 +803,10 @@ def setup(self, args, state, model, **kwargs):
         if self._wandb is None:
             return
         self._initialized = True
+
+        # prepare to handle potential configuration issues during setup
+        from wandb.sdk.lib.config_util import ConfigError as WandbConfigError
+
         if state.is_world_process_zero:
             logger.info(
                 'Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"'
@@ -852,7 +856,13 @@ def setup(self, args, state, model, **kwargs):
             try:
                 self._wandb.config["model/num_parameters"] = model.num_parameters()
             except AttributeError:
-                logger.info("Could not log the number of model parameters in Weights & Biases.")
+                logger.info(
+                    "Could not log the number of model parameters in Weights & Biases due to an AttributeError."
+                )
+            except WandbConfigError:
+                logger.warning(
+                    "A ConfigError was raised whilst setting the number of model parameters in Weights & Biases config."
+                )
 
             # log the initial model architecture to an artifact
             if self._log_model.is_enabled:

From f883827c0a0832b9dd53ede18aa7fffe74a1fec2 Mon Sep 17 00:00:00 2001
From: Yoach Lacombe <52246514+ylacombe@users.noreply.github.com>
Date: Wed, 18 Sep 2024 16:25:45 +0200
Subject: [PATCH 38/67] Fix tests in ASR pipeline (#33545)

---
 ..._pipelines_automatic_speech_recognition.py | 74 +++++++++----------
 1 file changed, 35 insertions(+), 39 deletions(-)

diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
index abb07d831ad0..842933d2b76c 100644
--- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py
+++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
@@ -295,8 +295,8 @@ def test_torch_large(self):
         self.assertEqual(output, {"text": ""})
 
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
-        filename = ds[40]["file"]
-        output = speech_recognizer(filename)
+        audio = ds[40]["audio"]
+        output = speech_recognizer(audio)
         self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"})
 
     @require_torch
@@ -312,8 +312,8 @@ def test_torch_large_with_input_features(self):
         self.assertEqual(output, {"text": ""})
 
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
-        filename = ds[40]["file"]
-        output = speech_recognizer(filename)
+        audio = ds[40]["audio"]
+        output = speech_recognizer(audio)
         self.assertEqual(output, {"text": "a man said to the universe sir i exist"})
 
     @slow
@@ -542,11 +542,11 @@ def test_torch_whisper(self):
             framework="pt",
         )
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
-        filename = ds[40]["file"]
-        output = speech_recognizer(filename)
+        audio = ds[40]["audio"]
+        output = speech_recognizer(audio)
         self.assertEqual(output, {"text": " A man said to the universe, Sir, I exist."})
 
-        output = speech_recognizer([filename], chunk_length_s=5, batch_size=4)
+        output = speech_recognizer([ds[40]["audio"]], chunk_length_s=5, batch_size=4)
         self.assertEqual(output, [{"text": " A man said to the universe, Sir, I exist."}])
 
     @require_torch
@@ -1014,8 +1014,8 @@ def test_torch_speech_encoder_decoder(self):
         )
 
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
-        filename = ds[40]["file"]
-        output = speech_recognizer(filename)
+        audio = ds[40]["audio"]
+        output = speech_recognizer(audio)
         self.assertEqual(output, {"text": 'Ein Mann sagte zum Universum : " Sir, ich existiert! "'})
 
     @slow
@@ -1032,13 +1032,11 @@ def test_simple_wav2vec2(self):
         self.assertEqual(output, {"text": ""})
 
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
-        filename = ds[40]["file"]
-        output = asr(filename)
+        audio = ds[40]["audio"]
+        output = asr(audio)
         self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"})
 
-        filename = ds[40]["file"]
-        with open(filename, "rb") as f:
-            data = f.read()
+        data = Audio().encode_example(ds[40]["audio"])["bytes"]
         output = asr(data)
         self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"})
 
@@ -1058,13 +1056,11 @@ def test_simple_s2t(self):
         self.assertEqual(output, {"text": "(Applausi)"})
 
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
-        filename = ds[40]["file"]
-        output = asr(filename)
+        audio = ds[40]["audio"]
+        output = asr(audio)
         self.assertEqual(output, {"text": "Un uomo disse all'universo: \"Signore, io esisto."})
 
-        filename = ds[40]["file"]
-        with open(filename, "rb") as f:
-            data = f.read()
+        data = Audio().encode_example(ds[40]["audio"])["bytes"]
         output = asr(data)
         self.assertEqual(output, {"text": "Un uomo disse all'universo: \"Signore, io esisto."})
 
@@ -1078,13 +1074,13 @@ def test_simple_whisper_asr(self):
             framework="pt",
         )
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-        filename = ds[0]["file"]
-        output = speech_recognizer(filename)
+        audio = ds[0]["audio"]
+        output = speech_recognizer(audio)
         self.assertEqual(
             output,
             {"text": " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel."},
         )
-        output = speech_recognizer(filename, return_timestamps=True)
+        output = speech_recognizer(ds[0]["audio"], return_timestamps=True)
         self.assertEqual(
             output,
             {
@@ -1100,7 +1096,7 @@ def test_simple_whisper_asr(self):
             },
         )
         speech_recognizer.model.generation_config.alignment_heads = [[2, 2], [3, 0], [3, 2], [3, 3], [3, 4], [3, 5]]
-        output = speech_recognizer(filename, return_timestamps="word")
+        output = speech_recognizer(ds[0]["audio"], return_timestamps="word")
         # fmt: off
         self.assertEqual(
             output,
@@ -1135,7 +1131,7 @@ def test_simple_whisper_asr(self):
             "^Whisper cannot return `char` timestamps, only word level or segment level timestamps. "
             "Use `return_timestamps='word'` or `return_timestamps=True` respectively.$",
         ):
-            _ = speech_recognizer(filename, return_timestamps="char")
+            _ = speech_recognizer(audio, return_timestamps="char")
 
     @slow
     @require_torch
@@ -1147,8 +1143,8 @@ def test_simple_whisper_translation(self):
             framework="pt",
         )
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
-        filename = ds[40]["file"]
-        output = speech_recognizer(filename)
+        audio = ds[40]["audio"]
+        output = speech_recognizer(audio)
         self.assertEqual(output, {"text": " A man said to the universe, Sir, I exist."})
 
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
@@ -1158,7 +1154,7 @@ def test_simple_whisper_translation(self):
         speech_recognizer_2 = AutomaticSpeechRecognitionPipeline(
             model=model, tokenizer=tokenizer, feature_extractor=feature_extractor
         )
-        output_2 = speech_recognizer_2(filename)
+        output_2 = speech_recognizer_2(ds[0]["audio"])
         self.assertEqual(output, output_2)
 
         # either use generate_kwargs or set the model's generation_config
@@ -1170,7 +1166,7 @@ def test_simple_whisper_translation(self):
             feature_extractor=feature_extractor,
             generate_kwargs={"task": "transcribe", "language": "<|it|>"},
         )
-        output_3 = speech_translator(filename)
+        output_3 = speech_translator(ds[0]["audio"])
         self.assertEqual(output_3, {"text": " Un uomo ha detto all'universo, Sir, esiste."})
 
     @slow
@@ -1182,10 +1178,10 @@ def test_whisper_language(self):
             framework="pt",
         )
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-        filename = ds[0]["file"]
+        audio = ds[0]["audio"]
 
         # 1. English-only model compatible with no language argument
-        output = speech_recognizer(filename)
+        output = speech_recognizer(audio)
         self.assertEqual(
             output,
             {"text": " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel."},
@@ -1197,7 +1193,7 @@ def test_whisper_language(self):
             "Cannot specify `task` or `language` for an English-only model. If the model is intended to be multilingual, "
             "pass `is_multilingual=True` to generate, or update the generation config.",
         ):
-            _ = speech_recognizer(filename, generate_kwargs={"language": "en"})
+            _ = speech_recognizer(ds[0]["audio"], generate_kwargs={"language": "en"})
 
         # 3. Multilingual model accepts language argument
         speech_recognizer = pipeline(
@@ -1205,7 +1201,7 @@ def test_whisper_language(self):
             model="openai/whisper-tiny",
             framework="pt",
         )
-        output = speech_recognizer(filename, generate_kwargs={"language": "en"})
+        output = speech_recognizer(ds[0]["audio"], generate_kwargs={"language": "en"})
         self.assertEqual(
             output,
             {"text": " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel."},
@@ -1315,8 +1311,8 @@ def test_xls_r_to_en(self):
         )
 
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
-        filename = ds[40]["file"]
-        output = speech_recognizer(filename)
+        audio = ds[40]["audio"]
+        output = speech_recognizer(audio)
         self.assertEqual(output, {"text": "A man said to the universe: “Sir, I exist."})
 
     @slow
@@ -1331,8 +1327,8 @@ def test_xls_r_from_en(self):
         )
 
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
-        filename = ds[40]["file"]
-        output = speech_recognizer(filename)
+        audio = ds[40]["audio"]
+        output = speech_recognizer(audio)
         self.assertEqual(output, {"text": "Ein Mann sagte zu dem Universum, Sir, ich bin da."})
 
     @slow
@@ -1348,9 +1344,8 @@ def test_speech_to_text_leveraged(self):
         )
 
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
-        filename = ds[40]["file"]
-
-        output = speech_recognizer(filename)
+        audio = ds[40]["audio"]
+        output = speech_recognizer(audio)
         self.assertEqual(output, {"text": "a man said to the universe sir i exist"})
 
     @slow
@@ -1561,6 +1556,7 @@ def test_whisper_longform(self):
             feature_extractor=processor.feature_extractor,
             max_new_tokens=128,
             device=torch_device,
+            return_timestamps=True,  # to allow longform generation
         )
 
         ds = load_dataset("distil-whisper/meanwhile", "default")["test"]

From fc83a4d45921150b4c23b68d08ac9ee946070149 Mon Sep 17 00:00:00 2001
From: Umar Butler <umar@umar.au>
Date: Thu, 19 Sep 2024 00:41:50 +1000
Subject: [PATCH 39/67] Added support for bfloat16 to zero-shot classification
 pipeline (#33554)

* Added support for bfloat16 to zero-shot classification pipeline

* Ensure support for TF.

Co-authored-by: Matt <Rocketknight1@users.noreply.github.com>

* Remove dependency on `torch`.

Co-authored-by: Matt <Rocketknight1@users.noreply.github.com>

---------

Co-authored-by: Matt <Rocketknight1@users.noreply.github.com>
---
 src/transformers/pipelines/zero_shot_classification.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/transformers/pipelines/zero_shot_classification.py b/src/transformers/pipelines/zero_shot_classification.py
index 9a600bc8ad0f..f4aee3341e30 100644
--- a/src/transformers/pipelines/zero_shot_classification.py
+++ b/src/transformers/pipelines/zero_shot_classification.py
@@ -239,7 +239,10 @@ def _forward(self, inputs):
     def postprocess(self, model_outputs, multi_label=False):
         candidate_labels = [outputs["candidate_label"] for outputs in model_outputs]
         sequences = [outputs["sequence"] for outputs in model_outputs]
-        logits = np.concatenate([output["logits"].numpy() for output in model_outputs])
+        if self.framework == "pt":
+            logits = np.concatenate([output["logits"].float().numpy() for output in model_outputs])
+        else:
+            logits = np.concatenate([output["logits"].numpy() for output in model_outputs])
         N = logits.shape[0]
         n = len(candidate_labels)
         num_sequences = N // n

From 7542fac2c7e5e5761fb0394b045a8e4d9168da1c Mon Sep 17 00:00:00 2001
From: Joao Gante <joaofranciscocardosogante@gmail.com>
Date: Wed, 18 Sep 2024 15:43:06 +0100
Subject: [PATCH 40/67] =?UTF-8?q?Pipeline:=20no=20side-effects=20on=20`mod?=
 =?UTF-8?q?el.config`=20and=20`model.generation=5Fconfig`=20=F0=9F=94=AB?=
 =?UTF-8?q?=20=20(#33480)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../generation/configuration_utils.py         |  4 ++
 src/transformers/generation/utils.py          | 13 ++++---
 .../pipelines/automatic_speech_recognition.py |  4 ++
 src/transformers/pipelines/base.py            | 37 +++++++++++--------
 .../pipelines/document_question_answering.py  |  4 ++
 src/transformers/pipelines/image_to_text.py   |  4 ++
 .../pipelines/table_question_answering.py     |  4 ++
 .../pipelines/text2text_generation.py         | 11 ++++--
 src/transformers/pipelines/text_generation.py | 13 ++++---
 src/transformers/pipelines/text_to_audio.py   |  6 ++-
 .../pipelines/visual_question_answering.py    |  4 ++
 tests/pipelines/test_pipelines_common.py      | 26 +++++++++++++
 tests/utils/test_modeling_utils.py            | 32 ++++++++++++++++
 13 files changed, 132 insertions(+), 30 deletions(-)

diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py
index e2585b1b9ed4..5e9ac835c19d 100644
--- a/src/transformers/generation/configuration_utils.py
+++ b/src/transformers/generation/configuration_utils.py
@@ -1229,6 +1229,10 @@ def from_model_config(cls, model_config: PretrainedConfig) -> "GenerationConfig"
         """
         config_dict = model_config.to_dict()
         config_dict.pop("_from_model_config", None)
+
+        # Removes all `None` from the model config dict -- this lets the generation config defaults to take hold
+        config_dict = {key: value for key, value in config_dict.items() if value is not None}
+
         generation_config = cls.from_dict(config_dict, return_unused_kwargs=False, _from_model_config=True)
 
         # Special case: some models have generation attributes set in the decoder. Use them if still unset in the
diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py
index 019eb6c27f18..d8896f91267d 100644
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@@ -1334,23 +1334,26 @@ def _prepare_generation_config(
             # the following conditions must be met
             # 1) the generation config must have been created from the model config (`_from_model_config` field);
             # 2) the generation config must have seen no modification since its creation (the hash is the same);
-            # 3) the user must have set generation parameters in the model config.
+            # 3) there are non-default generation parameters in the model config.
+            # 4) the user must have set new generation parameters in the model config.
             # NOTE: `torch.compile` can't compile `hash`, this legacy support is disabled with compilation.
             if (
                 not is_torchdynamo_compiling()
                 and self.generation_config._from_model_config  # 1)
                 and self.generation_config._original_object_hash == hash(self.generation_config)  # 2)
+                and len(self.config._get_non_default_generation_parameters()) > 0  # 3)
             ):
                 new_generation_config = GenerationConfig.from_model_config(self.config)
-                if new_generation_config != self.generation_config:  # 3)
+                if new_generation_config != self.generation_config:  # 4)
                     warnings.warn(
                         "You have modified the pretrained model configuration to control generation. This is a"
-                        " deprecated strategy to control generation and will be removed soon, in a future version."
+                        " deprecated strategy to control generation and will be removed in v5."
                         " Please use and modify the model generation configuration (see"
-                        " https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )"
+                        " https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )",
+                        UserWarning,
                     )
                     self.generation_config = new_generation_config
-            using_model_generation_config = True
+
             generation_config = self.generation_config
             using_model_generation_config = True
 
diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py
index f3de341d8895..7c122bed5437 100644
--- a/src/transformers/pipelines/automatic_speech_recognition.py
+++ b/src/transformers/pipelines/automatic_speech_recognition.py
@@ -501,6 +501,10 @@ def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs):
                     else:
                         generate_kwargs["num_frames"] = num_frames
 
+            # User-defined `generation_config` passed to the pipeline call take precedence
+            if "generation_config" not in generate_kwargs:
+                generate_kwargs["generation_config"] = self.generation_config
+
             tokens = self.model.generate(
                 inputs=inputs,
                 attention_mask=attention_mask,
diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py
index 7db33ab5bd1a..40a91a0d484b 100644
--- a/src/transformers/pipelines/base.py
+++ b/src/transformers/pipelines/base.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import collections
+import copy
 import csv
 import importlib
 import json
@@ -899,22 +900,26 @@ def __init__(
         ):
             self.model.to(self.device)
 
-        # Update config and generation_config with task specific parameters
-        task_specific_params = self.model.config.task_specific_params
-        if task_specific_params is not None and task in task_specific_params:
-            self.model.config.update(task_specific_params.get(task))
-            if self.model.can_generate():
-                self.model.generation_config.update(**task_specific_params.get(task))
-
-        # Pipelines calling `generate`: if the tokenizer has a pad token but the model doesn't, set it in the
-        # forward params so that `generate` is aware of the pad token.
-        if (
-            self.tokenizer is not None
-            and self.model.can_generate()
-            and self.tokenizer.pad_token_id is not None
-            and self.model.generation_config.pad_token_id is None
-        ):
-            self.model.generation_config.pad_token_id = self.tokenizer.pad_token_id
+        # If the model can generate, create a local generation config. This is done to avoid side-effects on the model
+        # as we apply local tweaks to the generation config.
+        if self.model.can_generate():
+            self.prefix = self.model.config.prefix if hasattr(self.model.config, "prefix") else None
+            self.generation_config = copy.deepcopy(self.model.generation_config)
+            # Update the generation config with task specific params if they exist
+            # NOTE: `prefix` is pipeline-specific and doesn't exist in the generation config.
+            task_specific_params = self.model.config.task_specific_params
+            if task_specific_params is not None and task in task_specific_params:
+                this_task_params = task_specific_params.get(task)
+                if "prefix" in this_task_params:
+                    self.prefix = this_task_params.pop("prefix")
+                self.generation_config.update(**this_task_params)
+            # If the tokenizer has a pad token but the model doesn't, set it so that `generate` is aware of it.
+            if (
+                self.tokenizer is not None
+                and self.tokenizer.pad_token_id is not None
+                and self.generation_config.pad_token_id is None
+            ):
+                self.generation_config.pad_token_id = self.tokenizer.pad_token_id
 
         self.call_count = 0
         self._batch_size = kwargs.pop("batch_size", None)
diff --git a/src/transformers/pipelines/document_question_answering.py b/src/transformers/pipelines/document_question_answering.py
index aa4fb48aae6a..9198f4322638 100644
--- a/src/transformers/pipelines/document_question_answering.py
+++ b/src/transformers/pipelines/document_question_answering.py
@@ -429,6 +429,10 @@ def _forward(self, model_inputs, **generate_kwargs):
         is_last = model_inputs.pop("is_last", False)
 
         if self.model_type == ModelType.VisionEncoderDecoder:
+            # User-defined `generation_config` passed to the pipeline call take precedence
+            if "generation_config" not in generate_kwargs:
+                generate_kwargs["generation_config"] = self.generation_config
+
             model_outputs = self.model.generate(**model_inputs, **generate_kwargs)
         else:
             model_outputs = self.model(**model_inputs)
diff --git a/src/transformers/pipelines/image_to_text.py b/src/transformers/pipelines/image_to_text.py
index 88dce8e591ae..91d44c46d25c 100644
--- a/src/transformers/pipelines/image_to_text.py
+++ b/src/transformers/pipelines/image_to_text.py
@@ -181,6 +181,10 @@ def _forward(self, model_inputs, **generate_kwargs):
         ):
             model_inputs["input_ids"] = None
 
+        # User-defined `generation_config` passed to the pipeline call take precedence
+        if "generation_config" not in generate_kwargs:
+            generate_kwargs["generation_config"] = self.generation_config
+
         # FIXME: We need to pop here due to a difference in how `generation.py` and `generation.tf_utils.py`
         #  parse inputs. In the Tensorflow version, `generate` raises an error if we don't use `input_ids` whereas
         #  the PyTorch version matches it with `self.model.main_input_name` or `self.model.encoder.main_input_name`
diff --git a/src/transformers/pipelines/table_question_answering.py b/src/transformers/pipelines/table_question_answering.py
index 702a47b7c3cb..77c95432c721 100644
--- a/src/transformers/pipelines/table_question_answering.py
+++ b/src/transformers/pipelines/table_question_answering.py
@@ -385,6 +385,10 @@ def _forward(self, model_inputs, sequential=False, **generate_kwargs):
             else:
                 outputs = self.batch_inference(**model_inputs)
         else:
+            # User-defined `generation_config` passed to the pipeline call take precedence
+            if "generation_config" not in generate_kwargs:
+                generate_kwargs["generation_config"] = self.generation_config
+
             outputs = self.model.generate(**model_inputs, **generate_kwargs)
         model_outputs = {"model_inputs": model_inputs, "table": table, "outputs": outputs}
         return model_outputs
diff --git a/src/transformers/pipelines/text2text_generation.py b/src/transformers/pipelines/text2text_generation.py
index 42d97f4d11b9..75ded8ac085c 100644
--- a/src/transformers/pipelines/text2text_generation.py
+++ b/src/transformers/pipelines/text2text_generation.py
@@ -115,7 +115,7 @@ def check_inputs(self, input_length: int, min_length: int, max_length: int):
         return True
 
     def _parse_and_tokenize(self, *args, truncation):
-        prefix = self.model.config.prefix if self.model.config.prefix is not None else ""
+        prefix = self.prefix if self.prefix is not None else ""
         if isinstance(args[0], list):
             if self.tokenizer.pad_token_id is None:
                 raise ValueError("Please make sure that the tokenizer has a pad_token_id when using a batch input")
@@ -185,9 +185,14 @@ def _forward(self, model_inputs, **generate_kwargs):
 
         self.check_inputs(
             input_length,
-            generate_kwargs.get("min_length", self.model.config.min_length),
-            generate_kwargs.get("max_length", self.model.config.max_length),
+            generate_kwargs.get("min_length", self.generation_config.min_length),
+            generate_kwargs.get("max_length", self.generation_config.max_length),
         )
+
+        # User-defined `generation_config` passed to the pipeline call take precedence
+        if "generation_config" not in generate_kwargs:
+            generate_kwargs["generation_config"] = self.generation_config
+
         output_ids = self.model.generate(**model_inputs, **generate_kwargs)
         out_b = output_ids.shape[0]
         if self.framework == "pt":
diff --git a/src/transformers/pipelines/text_generation.py b/src/transformers/pipelines/text_generation.py
index 8bd1017ffc66..9bffca522d5f 100644
--- a/src/transformers/pipelines/text_generation.py
+++ b/src/transformers/pipelines/text_generation.py
@@ -103,8 +103,8 @@ def __init__(self, *args, **kwargs):
             # It also defines both some preprocess_kwargs and generate_kwargs
             # which is why we cannot put them in their respective methods.
             prefix = None
-            if self.model.config.prefix is not None:
-                prefix = self.model.config.prefix
+            if self.prefix is not None:
+                prefix = self.prefix
             if prefix is None and self.model.__class__.__name__ in [
                 "XLNetLMHeadModel",
                 "TransfoXLLMHeadModel",
@@ -316,7 +316,7 @@ def preprocess(
             if "max_new_tokens" in generate_kwargs:
                 new_tokens = generate_kwargs["max_new_tokens"]
             else:
-                new_tokens = generate_kwargs.get("max_length", self.model.config.max_length) - cur_len
+                new_tokens = generate_kwargs.get("max_length", self.generation_config.max_length) - cur_len
                 if new_tokens < 0:
                     raise ValueError("We cannot infer how many new tokens are expected")
             if cur_len + new_tokens > self.tokenizer.model_max_length:
@@ -354,7 +354,7 @@ def _forward(self, model_inputs, **generate_kwargs):
                 and generate_kwargs["generation_config"].max_new_tokens is not None
             )
             if not has_max_new_tokens:
-                generate_kwargs["max_length"] = generate_kwargs.get("max_length") or self.model.config.max_length
+                generate_kwargs["max_length"] = generate_kwargs.get("max_length") or self.generation_config.max_length
                 generate_kwargs["max_length"] += prefix_length
             has_min_new_tokens = "min_new_tokens" in generate_kwargs or (
                 "generation_config" in generate_kwargs
@@ -363,7 +363,10 @@ def _forward(self, model_inputs, **generate_kwargs):
             if not has_min_new_tokens and "min_length" in generate_kwargs:
                 generate_kwargs["min_length"] += prefix_length
 
-        # BS x SL
+        # User-defined `generation_config` passed to the pipeline call take precedence
+        if "generation_config" not in generate_kwargs:
+            generate_kwargs["generation_config"] = self.generation_config
+
         generated_sequence = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, **generate_kwargs)
         out_b = generated_sequence.shape[0]
         if self.framework == "pt":
diff --git a/src/transformers/pipelines/text_to_audio.py b/src/transformers/pipelines/text_to_audio.py
index 81653f14d6d8..d17d18205920 100644
--- a/src/transformers/pipelines/text_to_audio.py
+++ b/src/transformers/pipelines/text_to_audio.py
@@ -111,7 +111,7 @@ def preprocess(self, text, **kwargs):
         if self.model.config.model_type == "bark":
             # bark Tokenizer is called with BarkProcessor which uses those kwargs
             new_kwargs = {
-                "max_length": self.model.generation_config.semantic_config.get("max_input_semantic_length", 256),
+                "max_length": self.generation_config.semantic_config.get("max_input_semantic_length", 256),
                 "add_special_tokens": False,
                 "return_attention_mask": True,
                 "return_token_type_ids": False,
@@ -137,6 +137,10 @@ def _forward(self, model_inputs, **kwargs):
             # we expect some kwargs to be additional tensors which need to be on the right device
             generate_kwargs = self._ensure_tensor_on_device(generate_kwargs, device=self.device)
 
+            # User-defined `generation_config` passed to the pipeline call take precedence
+            if "generation_config" not in generate_kwargs:
+                generate_kwargs["generation_config"] = self.generation_config
+
             # generate_kwargs get priority over forward_params
             forward_params.update(generate_kwargs)
 
diff --git a/src/transformers/pipelines/visual_question_answering.py b/src/transformers/pipelines/visual_question_answering.py
index e5849cbdec19..89988c0cba2b 100644
--- a/src/transformers/pipelines/visual_question_answering.py
+++ b/src/transformers/pipelines/visual_question_answering.py
@@ -162,6 +162,10 @@ def preprocess(self, inputs, padding=False, truncation=False, timeout=None):
 
     def _forward(self, model_inputs, **generate_kwargs):
         if self.model.can_generate():
+            # User-defined `generation_config` passed to the pipeline call take precedence
+            if "generation_config" not in generate_kwargs:
+                generate_kwargs["generation_config"] = self.generation_config
+
             model_outputs = self.model.generate(**model_inputs, **generate_kwargs)
         else:
             model_outputs = self.model(**model_inputs)
diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index ea36ae5728d1..1fec4be3d95c 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -31,6 +31,7 @@
     AutoTokenizer,
     DistilBertForSequenceClassification,
     MaskGenerationPipeline,
+    T5ForConditionalGeneration,
     TextClassificationPipeline,
     TextGenerationPipeline,
     TFAutoModelForSequenceClassification,
@@ -234,6 +235,31 @@ def test_auto_model_pipeline_registration_from_local_dir(self):
 
             self.assertIsInstance(pipe, TextGenerationPipeline)  # Assert successful load
 
+    @require_torch
+    def test_pipeline_with_task_parameters_no_side_effects(self):
+        """
+        Regression test: certain pipeline flags, like `task`, modified the model configuration, causing unexpected
+        side-effects
+        """
+        # This checkpoint has task-specific parameters that will modify the behavior of the pipeline
+        model = T5ForConditionalGeneration.from_pretrained("t5-small")
+        self.assertTrue(model.config.num_beams == 1)
+
+        # The task-specific parameters used to cause side-effects on `model.config` -- not anymore
+        pipe = pipeline(model=model, tokenizer=AutoTokenizer.from_pretrained("t5-small"), task="translation_en_to_de")
+        self.assertTrue(model.config.num_beams == 1)
+        self.assertTrue(model.generation_config.num_beams == 1)
+
+        # Under the hood: we now store a generation config in the pipeline. This generation config stores the
+        # task-specific paremeters.
+        self.assertTrue(pipe.generation_config.num_beams == 4)
+
+        # We can confirm that the task-specific parameters have an effect. (In this case, the default is `num_beams=1`,
+        # which would crash when `num_return_sequences=4` is passed.)
+        pipe("Hugging Face doesn't sell hugs.", num_return_sequences=4)
+        with self.assertRaises(ValueError):
+            pipe("Hugging Face doesn't sell hugs.", num_return_sequences=4, num_beams=1)
+
 
 @is_pipeline_test
 class PipelineScikitCompatTest(unittest.TestCase):
diff --git a/tests/utils/test_modeling_utils.py b/tests/utils/test_modeling_utils.py
index f78285fdb90d..2130ed4b7c88 100644
--- a/tests/utils/test_modeling_utils.py
+++ b/tests/utils/test_modeling_utils.py
@@ -1715,6 +1715,38 @@ def test_isin_mps_friendly(self):
             torch.equal(torch.isin(random_ids, random_test_tensor), isin_mps_friendly(random_ids, random_test_tensor))
         )
 
+    def test_save_and_load_config_with_custom_generation(self):
+        """
+        Regression test for the ability to save and load a config with a custom generation kwarg (i.e. a parameter
+        that gets moved to the generation config and reset on the model config)
+        """
+        model = T5ForConditionalGeneration.from_pretrained(TINY_T5)
+
+        # The default for `num_beams` is 1 and `early_stopping` is False
+        self.assertTrue(model.config.num_beams == 1)
+        self.assertTrue(model.config.early_stopping is False)
+
+        # When we save the model, this custom parameter should be moved to the generation config AND the model
+        # config should contain `None`
+        model.config.num_beams = 2
+        model.config.early_stopping = True
+        self.assertTrue(model.generation_config.num_beams == 1)  # unmodified generation config
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(tmp_dir)
+            new_model = T5ForConditionalGeneration.from_pretrained(tmp_dir)
+            # moved to generation config
+            self.assertTrue(new_model.generation_config.num_beams == 2)
+            self.assertTrue(new_model.generation_config.early_stopping is True)
+            # reset in the model config
+            self.assertTrue(new_model.config.num_beams is None)
+            self.assertTrue(new_model.config.early_stopping is None)
+
+            # Sanity check: We can run `generate` with the new model without any warnings
+            random_ids = torch.randint(0, 100, (1, 5))
+            with warnings.catch_warnings(record=True) as w:
+                new_model.generate(random_ids, max_new_tokens=3)
+            self.assertTrue(len(w) == 0)
+
 
 @slow
 @require_torch

From 8efc06ee1863bd6e34e8adb7b10901da87c66818 Mon Sep 17 00:00:00 2001
From: Matt <Rocketknight1@users.noreply.github.com>
Date: Wed, 18 Sep 2024 15:57:39 +0100
Subject: [PATCH 41/67] Return attention mask in ASR pipeline to avoid warnings
 (#33509)

return attention mask in ASR pipeline
---
 .../pipelines/automatic_speech_recognition.py             | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py
index 7c122bed5437..4301982f1e90 100644
--- a/src/transformers/pipelines/automatic_speech_recognition.py
+++ b/src/transformers/pipelines/automatic_speech_recognition.py
@@ -440,6 +440,7 @@ def preprocess(self, inputs, chunk_length_s=0, stride_length_s=None):
                     truncation=False,
                     padding="longest",
                     return_tensors="pt",
+                    return_attention_mask=True,
                 )
             else:
                 if self.type == "seq2seq_whisper" and stride is None:
@@ -448,13 +449,16 @@ def preprocess(self, inputs, chunk_length_s=0, stride_length_s=None):
                         sampling_rate=self.feature_extractor.sampling_rate,
                         return_tensors="pt",
                         return_token_timestamps=True,
+                        return_attention_mask=True,
                     )
                     extra["num_frames"] = processed.pop("num_frames")
                 else:
                     processed = self.feature_extractor(
-                        inputs, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="pt"
+                        inputs,
+                        sampling_rate=self.feature_extractor.sampling_rate,
+                        return_tensors="pt",
+                        return_attention_mask=True,
                     )
-
             if self.torch_dtype is not None:
                 processed = processed.to(dtype=self.torch_dtype)
             if stride is not None:

From 9db963aeed419c8379c6d6425186fec0bfb86908 Mon Sep 17 00:00:00 2001
From: Dominik Niedziela <99881522+dom-dziela@users.noreply.github.com>
Date: Wed, 18 Sep 2024 17:38:31 +0200
Subject: [PATCH 42/67] enforce original size to be a list (#33564)

* enforce original size to be a list

* formatting

* apply datatype change to unpad_image in llava_next
---
 src/transformers/models/llava_next/modeling_llava_next.py   | 6 ++++++
 .../models/llava_onevision/modeling_llava_onevision.py      | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py
index c1d1ca8c276d..bf76921090b2 100644
--- a/src/transformers/models/llava_next/modeling_llava_next.py
+++ b/src/transformers/models/llava_next/modeling_llava_next.py
@@ -123,6 +123,12 @@ def unpad_image(tensor, original_size):
     Returns:
         `torch.Tensor`: The unpadded image tensor.
     """
+    if not isinstance(original_size, (list, tuple)):
+        if not isinstance(original_size, (torch.Tensor, np.ndarray)):
+            raise TypeError(
+                f"image_size invalid type: {type(original_size)} not valid, should be either list, tuple, np.ndarray or tensor"
+            )
+        original_size = original_size.tolist()
     original_height, original_width = original_size
     current_height, current_width = tensor.shape[1:]
 
diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py
index 697ea84fea50..d3200fb5193d 100644
--- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py
+++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py
@@ -124,6 +124,12 @@ def unpad_image(tensor, original_size):
     Returns:
         `torch.Tensor`: The unpadded image tensor.
     """
+    if not isinstance(original_size, (list, tuple)):
+        if not isinstance(original_size, (torch.Tensor, np.ndarray)):
+            raise TypeError(
+                f"image_size invalid type: {type(original_size)} not valid, should be either list, tuple, np.ndarray or tensor"
+            )
+        original_size = original_size.tolist()
     original_height, original_width = original_size
     current_height, current_width = tensor.shape[1:]
 

From 7b1ce634cb16f86725826e427bf30f1276cc0e19 Mon Sep 17 00:00:00 2001
From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com>
Date: Wed, 18 Sep 2024 12:56:45 -0400
Subject: [PATCH 43/67] Improve compiled RT-DETR inference speed  (#33412)

* modify rt detr to improve inference times when compiled

* Remove redundant "to"

* Fix conditional lru_cache and missing shapes_list

* nit unnecessary list creation

* Fix compile error when ninja not available and custon kernel activated
---
 .../models/rt_detr/modeling_rt_detr.py        | 86 ++++++++++++++-----
 1 file changed, 64 insertions(+), 22 deletions(-)

diff --git a/src/transformers/models/rt_detr/modeling_rt_detr.py b/src/transformers/models/rt_detr/modeling_rt_detr.py
index ab83a81f5067..4e32434901cd 100644
--- a/src/transformers/models/rt_detr/modeling_rt_detr.py
+++ b/src/transformers/models/rt_detr/modeling_rt_detr.py
@@ -18,7 +18,7 @@
 import os
 import warnings
 from dataclasses import dataclass
-from functools import lru_cache, partial
+from functools import lru_cache, partial, wraps
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple, Union
 
@@ -737,7 +737,9 @@ def multi_scale_deformable_attention(
 ) -> Tensor:
     batch_size, _, num_heads, hidden_dim = value.shape
     _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
-    value_list = value.split([height.item() * width.item() for height, width in value_spatial_shapes], dim=1)
+    # Ignore copy
+    value_list = value.split([height * width for height, width in value_spatial_shapes], dim=1)
+
     sampling_grids = 2 * sampling_locations - 1
     sampling_value_list = []
     for level_id, (height, width) in enumerate(value_spatial_shapes):
@@ -849,6 +851,7 @@ def forward(
         position_embeddings: Optional[torch.Tensor] = None,
         reference_points=None,
         spatial_shapes=None,
+        spatial_shapes_list=None,
         level_start_index=None,
         output_attentions: bool = False,
     ):
@@ -858,7 +861,10 @@ def forward(
 
         batch_size, num_queries, _ = hidden_states.shape
         batch_size, sequence_length, _ = encoder_hidden_states.shape
-        if (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length:
+
+        # Ignore copy
+        total_elements = sum(shape[0] * shape[1] for shape in spatial_shapes_list)
+        if total_elements != sequence_length:
             raise ValueError(
                 "Make sure to align the spatial shapes with the sequence length of the encoder hidden states"
             )
@@ -893,9 +899,12 @@ def forward(
         else:
             raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}")
 
-        if self.disable_custom_kernels:
+        # Ignore copy
+        if self.disable_custom_kernels or MultiScaleDeformableAttention is None:
             # PyTorch implementation
-            output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights)
+            output = multi_scale_deformable_attention(
+                value, spatial_shapes_list, sampling_locations, attention_weights
+            )
         else:
             try:
                 # custom kernel
@@ -909,7 +918,9 @@ def forward(
                 )
             except Exception:
                 # PyTorch implementation
-                output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights)
+                output = multi_scale_deformable_attention(
+                    value, spatial_shapes_list, sampling_locations, attention_weights
+                )
         output = self.output_proj(output)
 
         return output, attention_weights
@@ -1064,6 +1075,7 @@ def forward(
         position_embeddings: Optional[torch.Tensor] = None,
         reference_points=None,
         spatial_shapes=None,
+        spatial_shapes_list=None,
         level_start_index=None,
         encoder_hidden_states: Optional[torch.Tensor] = None,
         encoder_attention_mask: Optional[torch.Tensor] = None,
@@ -1114,6 +1126,7 @@ def forward(
             position_embeddings=position_embeddings,
             reference_points=reference_points,
             spatial_shapes=spatial_shapes,
+            spatial_shapes_list=spatial_shapes_list,
             level_start_index=level_start_index,
             output_attentions=output_attentions,
         )
@@ -1299,14 +1312,16 @@ def __init__(self, config: RTDetrConfig):
             self.pan_blocks.append(RTDetrCSPRepLayer(config))
 
     @staticmethod
-    def build_2d_sincos_position_embedding(width, height, embed_dim=256, temperature=10000.0):
-        grid_w = torch.arange(int(width), dtype=torch.float32)
-        grid_h = torch.arange(int(height), dtype=torch.float32)
+    def build_2d_sincos_position_embedding(
+        width, height, embed_dim=256, temperature=10000.0, device="cpu", dtype=torch.float32
+    ):
+        grid_w = torch.arange(int(width), dtype=dtype, device=device)
+        grid_h = torch.arange(int(height), dtype=dtype, device=device)
         grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij")
         if embed_dim % 4 != 0:
             raise ValueError("Embed dimension must be divisible by 4 for 2D sin-cos position embedding")
         pos_dim = embed_dim // 4
-        omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
+        omega = torch.arange(pos_dim, dtype=dtype, device=device) / pos_dim
         omega = 1.0 / (temperature**omega)
 
         out_w = grid_w.flatten()[..., None] @ omega[None]
@@ -1372,8 +1387,13 @@ def forward(
                 src_flatten = hidden_states[enc_ind].flatten(2).permute(0, 2, 1)
                 if self.training or self.eval_size is None:
                     pos_embed = self.build_2d_sincos_position_embedding(
-                        width, height, self.encoder_hidden_dim, self.positional_encoding_temperature
-                    ).to(src_flatten.device, src_flatten.dtype)
+                        width,
+                        height,
+                        self.encoder_hidden_dim,
+                        self.positional_encoding_temperature,
+                        device=src_flatten.device,
+                        dtype=src_flatten.dtype,
+                    )
                 else:
                     pos_embed = None
 
@@ -1441,6 +1461,7 @@ def forward(
         position_embeddings=None,
         reference_points=None,
         spatial_shapes=None,
+        spatial_shapes_list=None,
         level_start_index=None,
         valid_ratios=None,
         output_attentions=None,
@@ -1512,6 +1533,7 @@ def forward(
                 encoder_hidden_states=encoder_hidden_states,
                 reference_points=reference_points_input,
                 spatial_shapes=spatial_shapes,
+                spatial_shapes_list=spatial_shapes_list,
                 level_start_index=level_start_index,
                 encoder_attention_mask=encoder_attention_mask,
                 output_attentions=output_attentions,
@@ -1575,6 +1597,27 @@ def forward(
         )
 
 
+def compile_compatible_lru_cache(*lru_args, **lru_kwargs):
+    def decorator(func):
+        @wraps(func)
+        def wrapper(self, *args, **kwargs):
+            if not torch.compiler.is_compiling():
+                # Cache the function only if the model is not being compiled
+                # check if the function is already cached, otherwise create it
+                if not hasattr(self, f"_cached_{func.__name__}"):
+                    self.__setattr__(
+                        f"_cached_{func.__name__}", lru_cache(*lru_args, **lru_kwargs)(func.__get__(self))
+                    )
+                return self.__getattribute__(f"_cached_{func.__name__}")(*args, **kwargs)
+            else:
+                # Otherwise, just call the original function
+                return func(self, *args, **kwargs)
+
+        return wrapper
+
+    return decorator
+
+
 @add_start_docstrings(
     """
     RT-DETR Model (consisting of a backbone and encoder-decoder) outputting raw hidden states without any head on top.
@@ -1626,7 +1669,7 @@ def __init__(self, config: RTDetrConfig):
 
         # init encoder output anchors and valid_mask
         if config.anchor_image_size:
-            self.anchors, self.valid_mask = self.generate_anchors()
+            self.anchors, self.valid_mask = self.generate_anchors(dtype=self.dtype)
 
         # Create decoder input projection layers
         # https://github.com/lyuwenyu/RT-DETR/blob/94f5e16708329d2f2716426868ec89aa774af016/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py#L412
@@ -1669,12 +1712,8 @@ def unfreeze_backbone(self):
         for param in self.backbone.parameters():
             param.requires_grad_(True)
 
-    @lru_cache(maxsize=32)
-    def generate_anchors(self, spatial_shapes=None, grid_size=0.05):
-        # We always generate anchors in float32 to preserve equivalence between
-        # dynamic and static anchor inference
-        dtype = torch.float32
-
+    @compile_compatible_lru_cache(maxsize=32)
+    def generate_anchors(self, spatial_shapes=None, grid_size=0.05, device="cpu", dtype=torch.float32):
         if spatial_shapes is None:
             spatial_shapes = [
                 [int(self.config.anchor_image_size[0] / s), int(self.config.anchor_image_size[1] / s)]
@@ -1683,10 +1722,12 @@ def generate_anchors(self, spatial_shapes=None, grid_size=0.05):
         anchors = []
         for level, (height, width) in enumerate(spatial_shapes):
             grid_y, grid_x = torch.meshgrid(
-                torch.arange(end=height, dtype=dtype), torch.arange(end=width, dtype=dtype), indexing="ij"
+                torch.arange(end=height, dtype=dtype, device=device),
+                torch.arange(end=width, dtype=dtype, device=device),
+                indexing="ij",
             )
             grid_xy = torch.stack([grid_x, grid_y], -1)
-            valid_wh = torch.tensor([width, height]).to(dtype)
+            valid_wh = torch.tensor([width, height], device=device).to(dtype)
             grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_wh
             wh = torch.ones_like(grid_xy) * grid_size * (2.0**level)
             anchors.append(torch.concat([grid_xy, wh], -1).reshape(-1, height * width, 4))
@@ -1826,7 +1867,7 @@ def forward(
             # Pass spatial_shapes as tuple to make it hashable and make sure
             # lru_cache is working for generate_anchors()
             spatial_shapes_tuple = tuple(spatial_shapes_list)
-            anchors, valid_mask = self.generate_anchors(spatial_shapes_tuple)
+            anchors, valid_mask = self.generate_anchors(spatial_shapes_tuple, device=device, dtype=dtype)
         else:
             anchors, valid_mask = self.anchors, self.valid_mask
 
@@ -1873,6 +1914,7 @@ def forward(
             encoder_attention_mask=attention_mask,
             reference_points=init_reference_points,
             spatial_shapes=spatial_shapes,
+            spatial_shapes_list=spatial_shapes_list,
             level_start_index=level_start_index,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,

From 6019f3ff7805f94b4bec1ad5fcf8c438ecb03ee6 Mon Sep 17 00:00:00 2001
From: Marc Sun <57196510+SunMarc@users.noreply.github.com>
Date: Wed, 18 Sep 2024 19:10:28 +0200
Subject: [PATCH 44/67] Fix bnb dequantization  (#33546)

---
 src/transformers/integrations/bitsandbytes.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/transformers/integrations/bitsandbytes.py b/src/transformers/integrations/bitsandbytes.py
index c49d353ccb52..f37ca9a2650b 100644
--- a/src/transformers/integrations/bitsandbytes.py
+++ b/src/transformers/integrations/bitsandbytes.py
@@ -437,6 +437,7 @@ def _dequantize_and_replace(
 
                 new_module.to(device)
                 model._modules[name] = new_module
+                has_been_replaced = True
         if len(list(module.children())) > 0:
             _, has_been_replaced = _dequantize_and_replace(
                 module,

From 5af7d41e49bbfc8319f462eb45253dcb3863dfb7 Mon Sep 17 00:00:00 2001
From: Yoach Lacombe <52246514+ylacombe@users.noreply.github.com>
Date: Wed, 18 Sep 2024 19:23:44 +0200
Subject: [PATCH 45/67] Codec integration (#33565)

* clean mimi commit

* some nits suggestions from Arthur

* make fixup

* rename repo id + change readme

* Update docs/source/en/model_doc/mimi.md

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* add flaky flag to batching equivalence due to audio_codes failing sometimes

---------

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
---
 docs/source/en/_toctree.yml                   |    2 +
 docs/source/en/index.md                       |    1 +
 docs/source/en/model_doc/mimi.md              |   69 +
 docs/source/en/perf_infer_gpu_one.md          |    2 +
 src/transformers/__init__.py                  |   14 +
 src/transformers/models/__init__.py           |    1 +
 .../models/auto/configuration_auto.py         |    2 +
 .../models/auto/feature_extraction_auto.py    |    1 +
 src/transformers/models/auto/modeling_auto.py |    1 +
 src/transformers/models/mimi/__init__.py      |   57 +
 .../models/mimi/configuration_mimi.py         |  234 +++
 .../convert_mimi_checkpoint_to_pytorch.py     |  198 ++
 src/transformers/models/mimi/modeling_mimi.py | 1722 +++++++++++++++++
 src/transformers/utils/dummy_pt_objects.py    |   14 +
 tests/models/mimi/__init__.py                 |    0
 tests/models/mimi/test_modeling_mimi.py       |  890 +++++++++
 16 files changed, 3208 insertions(+)
 create mode 100644 docs/source/en/model_doc/mimi.md
 create mode 100644 src/transformers/models/mimi/__init__.py
 create mode 100644 src/transformers/models/mimi/configuration_mimi.py
 create mode 100644 src/transformers/models/mimi/convert_mimi_checkpoint_to_pytorch.py
 create mode 100644 src/transformers/models/mimi/modeling_mimi.py
 create mode 100644 tests/models/mimi/__init__.py
 create mode 100644 tests/models/mimi/test_modeling_mimi.py

diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 7eff2a383026..59f0ff48d22a 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -722,6 +722,8 @@
         title: Hubert
       - local: model_doc/mctct
         title: MCTCT
+      - local: model_doc/mimi
+        title: Mimi
       - local: model_doc/mms
         title: MMS
       - local: model_doc/musicgen
diff --git a/docs/source/en/index.md b/docs/source/en/index.md
index c18426de4c03..cc5d7990929a 100644
--- a/docs/source/en/index.md
+++ b/docs/source/en/index.md
@@ -210,6 +210,7 @@ Flax), PyTorch, and/or TensorFlow.
 |                 [Megatron-BERT](model_doc/megatron-bert)                 |       ✅        |         ❌         |      ❌      |
 |                 [Megatron-GPT2](model_doc/megatron_gpt2)                 |       ✅        |         ✅         |      ✅      |
 |                       [MGP-STR](model_doc/mgp-str)                       |       ✅        |         ❌         |      ❌      |
+|                          [Mimi](model_doc/mimi)                          |       ✅        |         ❌         |      ❌      |
 |                       [Mistral](model_doc/mistral)                       |       ✅        |         ✅         |      ✅      |
 |                       [Mixtral](model_doc/mixtral)                       |       ✅        |         ❌         |      ❌      |
 |                         [mLUKE](model_doc/mluke)                         |       ✅        |         ❌         |      ❌      |
diff --git a/docs/source/en/model_doc/mimi.md b/docs/source/en/model_doc/mimi.md
new file mode 100644
index 000000000000..486d18363349
--- /dev/null
+++ b/docs/source/en/model_doc/mimi.md
@@ -0,0 +1,69 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Mimi
+
+## Overview
+
+The Mimi model was proposed in [Moshi: a speech-text foundation model for real-time dialogue](https://kyutai.org/Moshi.pdf) by Alexandre Défossez, Laurent Mazaré, Manu Orsini, Amélie Royer, Patrick Pérez, Hervé Jégou, Edouard Grave and Neil Zeghidour. Mimi is a high-fidelity audio codec model developed by the Kyutai team, that combines semantic and acoustic information into audio tokens running at 12Hz and a bitrate of 1.1kbps. In other words, it can be used to map audio waveforms into “audio tokens”, known as “codebooks”.
+
+The abstract from the paper is the following:
+
+*We introduce Moshi, a speech-text foundation model and full-duplex spoken dialogue framework. Current systems for spoken dialogue rely on pipelines of independent components, namely voice activity detection, speech recognition, textual dialogue and text-to-speech. Such frameworks cannot emulate the experience of real conversations. First, their complexity induces a latency of several seconds between interactions. Second, text being the intermediate modality for dialogue, non-linguistic information that modifies meaning— such as emotion or non-speech sounds— is lost in the interaction. Finally, they rely on a segmentation into speaker turns, which does not take into account overlapping speech, interruptions and interjections. Moshi solves these independent issues altogether by casting spoken dialogue as speech-to-speech generation. Starting from a text language model backbone, Moshi generates speech as tokens from the residual quantizer of a neural audio codec, while modeling separately its own speech and that of the user into parallel streams. This allows for the removal of explicit speaker turns, and the modeling of arbitrary conversational dynamics. We moreover extend the hierarchical semantic-to-acoustic token generation of previous work to first predict time-aligned text tokens as a prefix to audio tokens. Not only this “Inner Monologue” method significantly improves the linguistic quality of generated speech, but we also illustrate how it can provide streaming speech recognition and text-to-speech. Our resulting model is the first real-time full-duplex spoken large language model, with a theoretical latency of 160ms, 200ms in practice, and is available at github.com/kyutai-labs/moshi.* 
+
+Its architecture is based on [Encodec](model_doc/encodec) with several major differences:
+* it uses a much lower frame-rate.
+* it uses additional transformers for encoding and decoding for better latent contextualization
+* it uses a different quantization scheme: one codebook is dedicated to semantic projection.
+
+## Usage example 
+
+Here is a quick example of how to encode and decode an audio using this model:
+
+```python 
+>>> from datasets import load_dataset, Audio
+>>> from transformers import MimiModel, AutoFeatureExtractor
+>>> librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+
+>>> # load model and feature extractor
+>>> model = MimiModel.from_pretrained("kyutai/mimi")
+>>> feature_extractor = AutoFeatureExtractor.from_pretrained("kyutai/mimi")
+
+>>> # load audio sample
+>>> librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate))
+>>> audio_sample = librispeech_dummy[-1]["audio"]["array"]
+>>> inputs = feature_extractor(raw_audio=audio_sample, sampling_rate=feature_extractor.sampling_rate, return_tensors="pt")
+
+>>> encoder_outputs = model.encode(inputs["input_values"], inputs["padding_mask"])
+>>> audio_values = model.decode(encoder_outputs.audio_codes, inputs["padding_mask"])[0]
+>>> # or the equivalent with a forward pass
+>>> audio_values = model(inputs["input_values"], inputs["padding_mask"]).audio_values
+```
+
+This model was contributed by [Yoach Lacombe (ylacombe)](https://huggingface.co/ylacombe).
+The original code can be found [here](https://github.com/kyutai-labs/moshi).
+
+
+## MimiConfig
+
+[[autodoc]] MimiConfig
+
+## MimiModel
+
+[[autodoc]] MimiModel
+    - decode
+    - encode
+    - forward
diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md
index dd3433f2cd48..4c220dd0f148 100644
--- a/docs/source/en/perf_infer_gpu_one.md
+++ b/docs/source/en/perf_infer_gpu_one.md
@@ -61,6 +61,7 @@ FlashAttention-2 is currently supported for the following architectures:
 * [Llava-NeXT](https://huggingface.co/docs/transformers/model_doc/llava_next)
 * [Llava-NeXT-Video](https://huggingface.co/docs/transformers/model_doc/llava_next_video)
 * [LLaVA-Onevision](https://huggingface.co/docs/transformers/model_doc/llava_onevision)
+* [Mimi](https://huggingface.co/docs/transformers/model_doc/mimi)
 * [VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava)
 * [VideoLlava](https://huggingface.co/docs/transformers/model_doc/video_llava)
 * [M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)
@@ -228,6 +229,7 @@ For now, Transformers supports SDPA inference and training for the following arc
 * [Jamba](https://huggingface.co/docs/transformers/model_doc/jamba#transformers.JambaModel)
 * [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel)
 * [LLaVA-Onevision](https://huggingface.co/docs/transformers/model_doc/llava_onevision)
+* [Mimi](https://huggingface.co/docs/transformers/model_doc/mimi)
 * [Mistral](https://huggingface.co/docs/transformers/model_doc/mistral#transformers.MistralModel)
 * [Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral#transformers.MixtralModel)
 * [Musicgen](https://huggingface.co/docs/transformers/model_doc/musicgen#transformers.MusicgenModel)
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index bfd0d37916b5..aa13a97fe461 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -573,6 +573,7 @@
         "MgpstrProcessor",
         "MgpstrTokenizer",
     ],
+    "models.mimi": ["MimiConfig"],
     "models.mistral": ["MistralConfig"],
     "models.mixtral": ["MixtralConfig"],
     "models.mluke": [],
@@ -2666,6 +2667,12 @@
             "MgpstrPreTrainedModel",
         ]
     )
+    _import_structure["models.mimi"].extend(
+        [
+            "MimiModel",
+            "MimiPreTrainedModel",
+        ]
+    )
     _import_structure["models.mistral"].extend(
         [
             "MistralForCausalLM",
@@ -5345,6 +5352,9 @@
         MgpstrProcessor,
         MgpstrTokenizer,
     )
+    from .models.mimi import (
+        MimiConfig,
+    )
     from .models.mistral import MistralConfig
     from .models.mixtral import MixtralConfig
     from .models.mobilebert import (
@@ -7212,6 +7222,10 @@
             MgpstrModel,
             MgpstrPreTrainedModel,
         )
+        from .models.mimi import (
+            MimiModel,
+            MimiPreTrainedModel,
+        )
         from .models.mistral import (
             MistralForCausalLM,
             MistralForSequenceClassification,
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
index 2022048cd455..5b5d1e7902bd 100644
--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@@ -149,6 +149,7 @@
     megatron_bert,
     megatron_gpt2,
     mgp_str,
+    mimi,
     mistral,
     mixtral,
     mluke,
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index 2cd7d550d90b..5a6ec14e78cd 100644
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -167,6 +167,7 @@
         ("mega", "MegaConfig"),
         ("megatron-bert", "MegatronBertConfig"),
         ("mgp-str", "MgpstrConfig"),
+        ("mimi", "MimiConfig"),
         ("mistral", "MistralConfig"),
         ("mixtral", "MixtralConfig"),
         ("mobilebert", "MobileBertConfig"),
@@ -468,6 +469,7 @@
         ("megatron-bert", "Megatron-BERT"),
         ("megatron_gpt2", "Megatron-GPT2"),
         ("mgp-str", "MGP-STR"),
+        ("mimi", "Mimi"),
         ("mistral", "Mistral"),
         ("mixtral", "Mixtral"),
         ("mluke", "mLUKE"),
diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py
index 7f335d66584f..dca0c08aa909 100644
--- a/src/transformers/models/auto/feature_extraction_auto.py
+++ b/src/transformers/models/auto/feature_extraction_auto.py
@@ -69,6 +69,7 @@
         ("levit", "LevitFeatureExtractor"),
         ("maskformer", "MaskFormerFeatureExtractor"),
         ("mctct", "MCTCTFeatureExtractor"),
+        ("mimi", "EncodecFeatureExtractor"),
         ("mobilenet_v1", "MobileNetV1FeatureExtractor"),
         ("mobilenet_v2", "MobileNetV2FeatureExtractor"),
         ("mobilevit", "MobileViTFeatureExtractor"),
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index e0d15f1e2365..2bc71f07970a 100644
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -158,6 +158,7 @@
         ("mega", "MegaModel"),
         ("megatron-bert", "MegatronBertModel"),
         ("mgp-str", "MgpstrForSceneTextRecognition"),
+        ("mimi", "MimiModel"),
         ("mistral", "MistralModel"),
         ("mixtral", "MixtralModel"),
         ("mobilebert", "MobileBertModel"),
diff --git a/src/transformers/models/mimi/__init__.py b/src/transformers/models/mimi/__init__.py
new file mode 100644
index 000000000000..43b2bec6caa5
--- /dev/null
+++ b/src/transformers/models/mimi/__init__.py
@@ -0,0 +1,57 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    is_torch_available,
+)
+
+
+_import_structure = {
+    "configuration_mimi": ["MimiConfig"],
+}
+
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_mimi"] = [
+        "MimiModel",
+        "MimiPreTrainedModel",
+    ]
+
+if TYPE_CHECKING:
+    from .configuration_mimi import (
+        MimiConfig,
+    )
+
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_mimi import (
+            MimiModel,
+            MimiPreTrainedModel,
+        )
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/transformers/models/mimi/configuration_mimi.py b/src/transformers/models/mimi/configuration_mimi.py
new file mode 100644
index 000000000000..5564b1a54ba6
--- /dev/null
+++ b/src/transformers/models/mimi/configuration_mimi.py
@@ -0,0 +1,234 @@
+# coding=utf-8
+# Copyright 2024 Meta Platforms, Inc. and affiliates, and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Mimi model configuration"""
+
+import math
+
+import numpy as np
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class MimiConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of an [`MimiModel`]. It is used to instantiate a
+    Mimi model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the
+    [kyutai/mimi](https://huggingface.co/kyutai/mimi) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        sampling_rate (`int`, *optional*, defaults to 24000):
+            The sampling rate at which the audio waveform should be digitalized expressed in hertz (Hz).
+        frame_rate (`float`, *optional*, defaults to 12.5):
+            Framerate of the model.
+        audio_channels (`int`, *optional*, defaults to 1):
+            Number of channels in the audio data. Either 1 for mono or 2 for stereo.
+        hidden_size (`int`, *optional*, defaults to 512):
+            Intermediate representation dimension.
+        num_filters (`int`, *optional*, defaults to 64):
+            Number of convolution kernels of first `MimiConv1d` down sampling layer.
+        num_residual_layers (`int`,  *optional*, defaults to 1):
+            Number of residual layers.
+        upsampling_ratios (`Sequence[int]`, *optional*):
+            Kernel size and stride ratios. The encoder uses downsampling ratios instead of upsampling ratios, hence it
+            will use the ratios in the reverse order to the ones specified here that must match the decoder order.
+            If not specified, will defaults to `[8, 6, 5, 4]`
+        kernel_size (`int`, *optional*, defaults to 7):
+            Kernel size for the initial convolution.
+        last_kernel_size (`int`, *optional*, defaults to 3):
+            Kernel size for the last convolution layer.
+        residual_kernel_size (`int`, *optional*, defaults to 3):
+            Kernel size for the residual layers.
+        dilation_growth_rate (`int`, *optional*, defaults to 2):
+            How much to increase the dilation with each layer.
+        use_causal_conv (`bool`, *optional*, defaults to `True`):
+            Whether to use fully causal convolution.
+        pad_mode (`str`, *optional*, defaults to `"constant"`):
+            Padding mode for the convolutions.
+        compress (`int`, *optional*, defaults to 2):
+            Reduced dimensionality in residual branches.
+        trim_right_ratio (`float`, *optional*, defaults to 1.0):
+            Ratio for trimming at the right of the transposed convolution under the `use_causal_conv = True` setup. If
+            equal to 1.0, it means that all the trimming is done at the right.
+        codebook_size (`int`, *optional*, defaults to 2048):
+            Number of discret codes in each codebooks.
+        codebook_dim (`int`, *optional*, defaults to 256):
+            Dimension of the unquantized codebook vectors. If not defined, uses `hidden_size`.
+        num_quantizers (`int`, *optional*, defaults to 32):
+            Number of quantizer channels, or codebooks, in the quantizer.
+        use_conv_shortcut (`bool`, *optional*, defaults to `False`):
+            Whether to use a convolutional layer as the 'skip' connection in the `MimiResnetBlock` block. If False,
+            an identity function will be used, giving a generic residual connection.
+        vector_quantization_hidden_dimension (`int`, *optional*, defaults to 256):
+            Intermediate representation dimension in the residual vector quantization space.
+        num_semantic_quantizers (`int`, *optional*, defaults to 1):
+            Number of semantic quantizer channels, or codebooks, in the semantic quantizer. Must be lower than `num_quantizers`.
+        upsample_groups (`int`, *optional*, defaults to 512):
+            If `frame_rate!=encodec_frame_rate`, indicates the number of groups used in the upsampling operation to go from one rate to another.
+        num_hidden_layers (`int`, *optional*, defaults to 8):
+            Number of hidden layers in the Transformer models.
+        intermediate_size (`int`, *optional*, defaults to 2048):
+            Dimension of the MLP representations.
+        num_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
+        head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`):
+            The attention head dimension.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 8000):
+            The maximum sequence length that this model might ever be used with. Mimi's sliding window attention
+            allows sequence of up to 8000 tokens.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the LayerNorm normalization layers.
+        use_cache (`bool`, *optional*, defaults to `False`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        sliding_window (`int`, *optional*, defaults to 250):
+            Sliding window attention window size. If not specified, will default to `250`.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        layer_scale_initial_scale (`float`, *optional*, defaults to 0.01):
+            Initiale scale of the residual rescaling operation done in the Transformer models.
+        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+    Example:
+
+    ```python
+    >>> from transformers import MimiModel, MimiConfig
+
+    >>> # Initializing a "kyutai/mimi" style configuration
+    >>> configuration = MimiConfig()
+
+    >>> # Initializing a model (with random weights) from the "kyutai/mimi" style configuration
+    >>> model = MimiModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "mimi"
+
+    def __init__(
+        self,
+        sampling_rate=24_000,
+        frame_rate=12.5,
+        audio_channels=1,
+        hidden_size=512,
+        num_filters=64,
+        num_residual_layers=1,
+        upsampling_ratios=None,
+        kernel_size=7,
+        last_kernel_size=3,
+        residual_kernel_size=3,
+        dilation_growth_rate=2,
+        use_causal_conv=True,
+        pad_mode="constant",
+        compress=2,
+        trim_right_ratio=1.0,
+        codebook_size=2048,
+        codebook_dim=256,
+        num_quantizers=32,
+        use_conv_shortcut=False,
+        vector_quantization_hidden_dimension=256,
+        num_semantic_quantizers=1,
+        upsample_groups=512,
+        num_hidden_layers=8,
+        intermediate_size=2048,
+        num_attention_heads=8,
+        num_key_value_heads=8,
+        head_dim=None,
+        hidden_act="gelu",
+        max_position_embeddings=8000,
+        initializer_range=0.02,
+        norm_eps=1e-5,
+        use_cache=False,
+        rope_theta=10000.0,
+        sliding_window=250,
+        attention_dropout=0.0,
+        layer_scale_initial_scale=0.01,
+        attention_bias=False,
+        **kwargs,
+    ):
+        self.sampling_rate = sampling_rate
+        self.frame_rate = frame_rate
+        self.audio_channels = audio_channels
+        self.hidden_size = hidden_size
+        self.num_filters = num_filters
+        self.num_residual_layers = num_residual_layers
+        self.upsampling_ratios = upsampling_ratios if upsampling_ratios else [8, 6, 5, 4]
+        self.kernel_size = kernel_size
+        self.last_kernel_size = last_kernel_size
+        self.residual_kernel_size = residual_kernel_size
+        self.dilation_growth_rate = dilation_growth_rate
+        self.use_causal_conv = use_causal_conv
+        self.pad_mode = pad_mode
+        self.compress = compress
+        self.trim_right_ratio = trim_right_ratio
+        self.codebook_size = codebook_size
+        self.codebook_dim = codebook_dim if codebook_dim is not None else hidden_size
+        self.num_quantizers = num_quantizers
+        self.use_conv_shortcut = use_conv_shortcut
+        self.vector_quantization_hidden_dimension = vector_quantization_hidden_dimension
+        self.upsample_groups = upsample_groups
+        self.num_hidden_layers = num_hidden_layers
+        self.intermediate_size = intermediate_size
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.norm_eps = norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.sliding_window = sliding_window
+        self.attention_dropout = attention_dropout
+        self.head_dim = head_dim or hidden_size // num_attention_heads
+        self.layer_scale_initial_scale = layer_scale_initial_scale
+        self.attention_bias = attention_bias
+
+        if num_semantic_quantizers >= self.num_quantizers:
+            raise ValueError(
+                f"The number of semantic quantizers should be lower than the total number of quantizers {self.num_quantizers}, but is currently {num_semantic_quantizers}."
+            )
+        self.num_semantic_quantizers = num_semantic_quantizers
+        super().__init__(**kwargs)
+
+    @property
+    def encodec_frame_rate(self) -> int:
+        hop_length = np.prod(self.upsampling_ratios)
+        return math.ceil(self.sampling_rate / hop_length)
+
+    @property
+    def num_codebooks(self) -> int:
+        # alias to num_quantizers
+        return self.num_quantizers
diff --git a/src/transformers/models/mimi/convert_mimi_checkpoint_to_pytorch.py b/src/transformers/models/mimi/convert_mimi_checkpoint_to_pytorch.py
new file mode 100644
index 000000000000..c617fa036c5d
--- /dev/null
+++ b/src/transformers/models/mimi/convert_mimi_checkpoint_to_pytorch.py
@@ -0,0 +1,198 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert Mimi checkpoints."""
+
+import argparse
+
+import safetensors
+import torch
+
+from transformers import (
+    EncodecFeatureExtractor,
+    MimiConfig,
+    MimiModel,
+    logging,
+)
+
+
+logging.set_verbosity_info()
+logger = logging.get_logger("transformers.models.mimi")
+
+
+def assert_param_count(model_1, model_2):
+    count_1 = sum(p[1].numel() for p in model_1.named_parameters() if "final_proj" not in p[0])
+    count_2 = sum(p[1].numel() for p in model_2.named_parameters() if "final_proj" not in p[0])
+    assert count_1 == count_2, f"{model_1.__class__}: {count_1} != {model_2.__class__}: {count_2}"
+
+
+def param_count(model):
+    return sum(p[1].numel() for p in model.named_parameters() if "final_proj" not in p[0])
+
+
+def _grab_best_device(use_gpu=True):
+    if torch.cuda.device_count() > 0 and use_gpu:
+        device = "cuda"
+    else:
+        device = "cpu"
+    return torch.device(device)
+
+
+convert_list = [
+    # GENERAL
+    ("conv.conv.conv", "conv"),
+    ("convtr.convtr.convtr", "conv"),
+    ("conv.conv", "conv"),
+    ("convtr.convtr", "conv"),
+    # QUANTIZER
+    ("quantizer.rvq_first.vq", "quantizer.semantic_residual_vector_quantizer"),
+    ("quantizer.rvq_first", "quantizer.semantic_residual_vector_quantizer"),
+    ("quantizer.rvq_rest.vq", "quantizer.acoustic_residual_vector_quantizer"),
+    ("quantizer.rvq_rest", "quantizer.acoustic_residual_vector_quantizer"),
+    ("_codebook", "codebook"),
+    ("_initialized", "initialized"),
+    ("embedding_sum", "embed_sum"),
+    # ENCODER PART
+    ("encoder.model", "encoder.layers"),
+    ("decoder.model", "decoder.layers"),
+    # TRANSFORMERS PART
+    ("encoder_transformer.transformer", "encoder_transformer"),
+    ("decoder_transformer.transformer", "decoder_transformer"),
+    ("linear1", "mlp.fc1"),
+    ("linear2", "mlp.fc2"),
+    ("self_attn.out_proj", "self_attn.o_proj"),
+    ("norm1", "input_layernorm"),
+    ("norm2", "post_attention_layernorm"),
+    ("layer_scale_1", "self_attn_layer_scale"),
+    ("layer_scale_2", "mlp_layer_scale"),
+]
+
+
+def _convert_model(
+    state_dict,
+    hf_model,
+    convert_list,
+    device,
+    config,
+    unwanted_prefix=None,
+):
+    hidden_size = config.hidden_size
+    head_dim = config.head_dim
+    num_heads = int(config.hidden_size // config.head_dim)
+    num_key_value_heads = config.num_key_value_heads
+    key_value_head_dim = config.num_key_value_heads * head_dim
+
+    # permute for sliced rotary
+    def permute(w, n_heads, dim1=hidden_size, dim2=hidden_size):
+        return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2)
+
+    for k, v in list(state_dict.items()):
+        new_k = k if unwanted_prefix is None else k[len(unwanted_prefix) :]
+        for old_layer_name, new_layer_name in convert_list:
+            if old_layer_name in new_k:
+                new_k = new_k.replace(old_layer_name, new_layer_name)
+
+        if "in_proj_weight" in new_k:
+            # split qkv into query key and value
+            mixed_qkv = state_dict.pop(k)
+            qkv_dim = mixed_qkv.size(0) // 3
+
+            query_layer = mixed_qkv[:qkv_dim]
+            key_layer = mixed_qkv[qkv_dim : qkv_dim * 2]
+            value_layer = mixed_qkv[qkv_dim * 2 :]
+
+            state_dict[new_k.replace("in_proj_weight", "q_proj.weight")] = permute(query_layer, num_heads)
+            state_dict[new_k.replace("in_proj_weight", "k_proj.weight")] = permute(
+                key_layer, num_key_value_heads, dim1=key_value_head_dim
+            )
+            state_dict[new_k.replace("in_proj_weight", "v_proj.weight")] = value_layer
+        else:
+            state_dict[new_k] = state_dict.pop(k)
+
+    extra_keys = set(state_dict.keys()) - set(hf_model.state_dict().keys())
+    missing_keys = set(hf_model.state_dict().keys()) - set(state_dict.keys())
+    if len(extra_keys) != 0:
+        raise ValueError(f"extra keys found: {extra_keys}")
+    if len(missing_keys) != 0:
+        raise ValueError(f"missing keys: {missing_keys}")
+    hf_model.load_state_dict(state_dict, strict=True)
+    n_params = param_count(hf_model)
+
+    logger.info(f"model loaded: {round(n_params/1e6,1)}M params")
+
+    hf_model.eval()
+    hf_model.to(device)
+    del state_dict
+
+    return hf_model
+
+
+@torch.no_grad()
+def convert_checkpoint(
+    checkpoint_path,
+    pytorch_dump_folder_path,
+    config_path=None,
+    repo_id=None,
+):
+    """
+    Copy/paste/tweak model's weights to transformers design.
+    """
+    device = _grab_best_device()
+
+    if config_path is not None:
+        config = MimiConfig.from_pretrained(config_path)
+    else:
+        config = MimiConfig()
+
+    model = MimiModel(config)
+
+    feature_extractor = EncodecFeatureExtractor(
+        feature_size=config.audio_channels,
+        sampling_rate=config.sampling_rate,
+    )
+    feature_extractor.save_pretrained(pytorch_dump_folder_path)
+
+    original_checkpoint = safetensors.torch.load_file(checkpoint_path)
+    if "best_state" in original_checkpoint:
+        # we might have a training state saved, in which case discard the yaml results and just retain the weights
+        original_checkpoint = original_checkpoint["best_state"]
+
+    model = _convert_model(original_checkpoint, model, convert_list, device, config)
+
+    model.save_pretrained(pytorch_dump_folder_path)
+
+    if repo_id:
+        print("Pushing to the hub...")
+        feature_extractor.push_to_hub(repo_id)
+        model.push_to_hub(repo_id)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint")
+    parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
+    parser.add_argument(
+        "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
+    )
+    parser.add_argument(
+        "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
+    )
+
+    args = parser.parse_args()
+    convert_checkpoint(
+        args.checkpoint_path,
+        args.pytorch_dump_folder_path,
+        args.config_path,
+        args.push_to_hub,
+    )
diff --git a/src/transformers/models/mimi/modeling_mimi.py b/src/transformers/models/mimi/modeling_mimi.py
new file mode 100644
index 000000000000..db36250b3d89
--- /dev/null
+++ b/src/transformers/models/mimi/modeling_mimi.py
@@ -0,0 +1,1722 @@
+# coding=utf-8
+# Copyright 2024 Kyutai, and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Mimi model."""
+
+import math
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache, StaticCache
+from ...modeling_attn_mask_utils import AttentionMaskConverter
+from ...modeling_outputs import BaseModelOutputWithPast
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
+    ModelOutput,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_flash_attn_2_available,
+    is_flash_attn_greater_or_equal_2_10,
+    logging,
+    replace_return_docstrings,
+)
+from .configuration_mimi import MimiConfig
+
+
+if is_flash_attn_2_available():
+    from ...modeling_flash_attention_utils import _flash_attention_forward
+
+logger = logging.get_logger(__name__)
+
+
+# General docstring
+_CONFIG_FOR_DOC = "MimiConfig"
+
+
+# Copied from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position
+def _prepare_4d_causal_attention_mask_with_cache_position(
+    attention_mask: torch.Tensor,
+    sequence_length: int,
+    target_length: int,
+    dtype: torch.dtype,
+    device: torch.device,
+    min_dtype: float,
+    cache_position: torch.Tensor,
+    batch_size: int,
+):
+    """
+    Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+    `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+    Args:
+        attention_mask (`torch.Tensor`):
+            A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
+        sequence_length (`int`):
+            The sequence length being processed.
+        target_length (`int`):
+            The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
+        dtype (`torch.dtype`):
+            The dtype to use for the 4D attention mask.
+        device (`torch.device`):
+            The device to plcae the 4D attention mask on.
+        min_dtype (`float`):
+            The minimum value representable with the dtype `dtype`.
+        cache_position (`torch.Tensor`):
+            Indices depicting the position of the input sequence tokens in the sequence.
+        batch_size (`torch.Tensor`):
+            Batch size.
+    """
+    if attention_mask is not None and attention_mask.dim() == 4:
+        # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+        causal_mask = attention_mask
+    else:
+        causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
+        if sequence_length != 1:
+            causal_mask = torch.triu(causal_mask, diagonal=1)
+        causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+        causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+        if attention_mask is not None:
+            causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+            mask_length = attention_mask.shape[-1]
+            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+            padding_mask = padding_mask == 0
+            causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                padding_mask, min_dtype
+            )
+
+    return causal_mask
+
+
+@dataclass
+class MimiOutput(ModelOutput):
+    """
+    Args:
+        audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
+            Discret code embeddings computed using `model.encode`.
+        audio_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*)
+            Decoded audio values, obtained using the decoder part of Mimi.
+        encoder_past_key_values (`Cache`, *optional*):
+            Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the encoder transformer.
+            This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+            The model will output the same cache format that is fed as input.
+
+            If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes (those that don't
+            have their past key value states given to this model).
+        decoder_past_key_values (`Cache`, *optional*):
+            Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the decoder transformer.
+            This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+            The model will output the same cache format that is fed as input.
+
+            If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes (those that don't
+            have their past key value states given to this model).
+    """
+
+    audio_codes: torch.LongTensor = None
+    audio_values: torch.FloatTensor = None
+    encoder_past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None
+    decoder_past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None
+
+
+@dataclass
+class MimiEncoderOutput(ModelOutput):
+    """
+    Args:
+        audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
+            Discret code embeddings computed using `model.encode`.
+        encoder_past_key_values (`Cache`, *optional*):
+            Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the encoder transformer.
+            This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+            The model will output the same cache format that is fed as input.
+
+            If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes (those that don't
+            have their past key value states given to this model).
+    """
+
+    audio_codes: torch.LongTensor = None
+    encoder_past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None
+
+
+@dataclass
+class MimiDecoderOutput(ModelOutput):
+    """
+    Args:
+        audio_values (`torch.FloatTensor`  of shape `(batch_size, segment_length)`, *optional*):
+            Decoded audio values, obtained using the decoder part of Mimi.
+        decoder_past_key_values (`Cache`, *optional*):
+            Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the decoder transformer.
+            This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+            The model will output the same cache format that is fed as input.
+
+            If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes (those that don't
+            have their past key value states given to this model).
+    """
+
+    audio_values: torch.FloatTensor = None
+    decoder_past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None
+
+
+class MimiConv1d(nn.Module):
+    """Conv1d with asymmetric or causal padding and normalization."""
+
+    def __init__(
+        self,
+        config,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        dilation: int = 1,
+        groups: int = 1,
+        pad_mode=None,
+        bias: bool = True,
+    ):
+        super().__init__()
+        self.causal = config.use_causal_conv
+        self.pad_mode = config.pad_mode if pad_mode is None else pad_mode
+
+        # warn user on unusual setup between dilation and stride
+        if stride > 1 and dilation > 1:
+            logger.warning(
+                "MimiConv1d has been initialized with stride > 1 and dilation > 1"
+                f" (kernel_size={kernel_size} stride={stride}, dilation={dilation})."
+            )
+
+        self.conv = nn.Conv1d(
+            in_channels, out_channels, kernel_size, stride, dilation=dilation, groups=groups, bias=bias
+        )
+
+        kernel_size = self.conv.kernel_size[0]
+        stride = torch.tensor(self.conv.stride[0], dtype=torch.int64)
+        dilation = self.conv.dilation[0]
+
+        # Effective kernel size with dilations.
+        kernel_size = torch.tensor((kernel_size - 1) * dilation + 1, dtype=torch.int64)
+
+        self.register_buffer("stride", stride, persistent=False)
+        self.register_buffer("kernel_size", kernel_size, persistent=False)
+        self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)
+
+        # Asymmetric padding required for odd strides
+        self.padding_right = self.padding_total // 2
+        self.padding_left = self.padding_total - self.padding_right
+
+    def apply_weight_norm(self):
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
+        weight_norm(self.conv)
+
+    def remove_weight_norm(self):
+        nn.utils.remove_weight_norm(self.conv)
+
+    # Copied from transformers.models.encodec.modeling_encodec.EncodecConv1d._get_extra_padding_for_conv1d
+    def _get_extra_padding_for_conv1d(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        """See `pad_for_conv1d`."""
+        length = hidden_states.shape[-1]
+        n_frames = (length - self.kernel_size + self.padding_total) / self.stride + 1
+        n_frames = torch.ceil(n_frames).to(torch.int64) - 1
+        ideal_length = n_frames * self.stride + self.kernel_size - self.padding_total
+
+        return ideal_length - length
+
+    @staticmethod
+    # Copied from transformers.models.encodec.modeling_encodec.EncodecConv1d._pad1d
+    def _pad1d(hidden_states: torch.Tensor, paddings: Tuple[int, int], mode: str = "zero", value: float = 0.0):
+        """Tiny wrapper around torch.nn.functional.pad, just to allow for reflect padding on small input.
+        If this is the case, we insert extra 0 padding to the right before the reflection happens.
+        """
+        length = hidden_states.shape[-1]
+        padding_left, padding_right = paddings
+        if not mode == "reflect":
+            return nn.functional.pad(hidden_states, paddings, mode, value)
+
+        max_pad = max(padding_left, padding_right)
+        extra_pad = 0
+        if length <= max_pad:
+            extra_pad = max_pad - length + 1
+            hidden_states = nn.functional.pad(hidden_states, (0, extra_pad))
+        padded = nn.functional.pad(hidden_states, paddings, mode, value)
+        end = padded.shape[-1] - extra_pad
+        return padded[..., :end]
+
+    def forward(self, hidden_states):
+        extra_padding = self._get_extra_padding_for_conv1d(hidden_states)
+
+        if self.causal:
+            # Left padding for causal
+            hidden_states = self._pad1d(hidden_states, (self.padding_total, extra_padding), mode=self.pad_mode)
+        else:
+            hidden_states = self._pad1d(
+                hidden_states, (self.padding_left, self.padding_right + extra_padding), mode=self.pad_mode
+            )
+
+        hidden_states = self.conv(hidden_states)
+        return hidden_states
+
+
+class MimiConvTranspose1d(nn.Module):
+    """ConvTranspose1d with asymmetric or causal padding and normalization."""
+
+    def __init__(
+        self,
+        config,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        groups: int = 1,
+        bias=True,
+    ):
+        super().__init__()
+        self.causal = config.use_causal_conv
+        self.trim_right_ratio = config.trim_right_ratio
+        self.conv = nn.ConvTranspose1d(in_channels, out_channels, kernel_size, stride, groups=groups, bias=bias)
+
+        if not (self.causal or self.trim_right_ratio == 1.0):
+            raise ValueError("`trim_right_ratio` != 1.0 only makes sense for causal convolutions")
+
+        kernel_size = self.conv.kernel_size[0]
+        stride = self.conv.stride[0]
+        padding_total = kernel_size - stride
+
+        # We will only trim fixed padding. Extra padding from `pad_for_conv1d` would be
+        # removed at the very end, when keeping only the right length for the output,
+        # as removing it here would require also passing the length at the matching layer
+        # in the encoder.
+        if self.causal:
+            # Trim the padding on the right according to the specified ratio
+            # if trim_right_ratio = 1.0, trim everything from right
+            self.padding_right = math.ceil(padding_total * self.trim_right_ratio)
+        else:
+            # Asymmetric padding required for odd strides
+            self.padding_right = padding_total // 2
+
+        self.padding_left = padding_total - self.padding_right
+
+    def apply_weight_norm(self):
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
+        weight_norm(self.conv)
+
+    def remove_weight_norm(self):
+        nn.utils.remove_weight_norm(self.conv)
+
+    def forward(self, hidden_states):
+        hidden_states = self.conv(hidden_states)
+
+        # unpad
+        end = hidden_states.shape[-1] - self.padding_right
+        hidden_states = hidden_states[..., self.padding_left : end]
+        return hidden_states
+
+
+# Copied from transformers.models.encodec.modeling_encodec.EncodecResnetBlock with Encodec->Mimi,EnCodec->Mimi
+class MimiResnetBlock(nn.Module):
+    """
+    Residual block from SEANet model as used by Mimi.
+    """
+
+    def __init__(self, config: MimiConfig, dim: int, dilations: List[int]):
+        super().__init__()
+        kernel_sizes = (config.residual_kernel_size, 1)
+        if len(kernel_sizes) != len(dilations):
+            raise ValueError("Number of kernel sizes should match number of dilations")
+
+        hidden = dim // config.compress
+        block = []
+        for i, (kernel_size, dilation) in enumerate(zip(kernel_sizes, dilations)):
+            in_chs = dim if i == 0 else hidden
+            out_chs = dim if i == len(kernel_sizes) - 1 else hidden
+            block += [nn.ELU()]
+            block += [MimiConv1d(config, in_chs, out_chs, kernel_size, dilation=dilation)]
+        self.block = nn.ModuleList(block)
+
+        if config.use_conv_shortcut:
+            self.shortcut = MimiConv1d(config, dim, dim, kernel_size=1)
+        else:
+            self.shortcut = nn.Identity()
+
+    def forward(self, hidden_states):
+        residual = hidden_states
+        for layer in self.block:
+            hidden_states = layer(hidden_states)
+
+        return self.shortcut(residual) + hidden_states
+
+
+class MimiEncoder(nn.Module):
+    """SEANet encoder as used by Mimi."""
+
+    def __init__(self, config: MimiConfig):
+        super().__init__()
+        model = [MimiConv1d(config, config.audio_channels, config.num_filters, config.kernel_size)]
+        scaling = 1
+
+        # Downsample to raw audio scale
+        for ratio in reversed(config.upsampling_ratios):
+            current_scale = scaling * config.num_filters
+            # Add residual layers
+            for j in range(config.num_residual_layers):
+                model += [MimiResnetBlock(config, current_scale, [config.dilation_growth_rate**j, 1])]
+            # Add downsampling layers
+            model += [nn.ELU()]
+            model += [MimiConv1d(config, current_scale, current_scale * 2, kernel_size=ratio * 2, stride=ratio)]
+            scaling *= 2
+
+        model += [nn.ELU()]
+        model += [MimiConv1d(config, scaling * config.num_filters, config.hidden_size, config.last_kernel_size)]
+
+        self.layers = nn.ModuleList(model)
+
+    # Copied from transformers.models.encodec.modeling_encodec.EncodecEncoder.forward
+    def forward(self, hidden_states):
+        for layer in self.layers:
+            hidden_states = layer(hidden_states)
+        return hidden_states
+
+
+class MimiLayerScale(nn.Module):
+    """Layer scale from [Touvron et al 2021] (https://arxiv.org/pdf/2103.17239.pdf).
+    This rescales diagonally the residual outputs close to 0, with a learnt scale.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        channels = config.hidden_size
+        initial_scale = config.layer_scale_initial_scale
+        self.scale = nn.Parameter(torch.full((channels,), initial_scale, requires_grad=True))
+
+    def forward(self, x: torch.Tensor):
+        return self.scale * x
+
+
+# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Mimi
+class MimiRotaryEmbedding(nn.Module):
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+        super().__init__()
+
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+    @torch.no_grad()
+    # copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding.forward
+    # TODO(joao): add me back asap :)
+    def forward(self, x, position_ids):
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        position_ids_expanded = position_ids[:, None, :].float()
+        # Force float32 since bfloat16 loses precision on long contexts
+        # See https://github.com/huggingface/transformers/pull/29285
+        device_type = x.device.type
+        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos()
+            sin = emb.sin()
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+# Copied from transformers.models.llama.modeling_llama.rotate_half
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+class MimiMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.activation_fn = ACT2FN[config.hidden_act]
+        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
+        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
+
+    # Copied from transformers.models.clip.modeling_clip.CLIPMLP.forward
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states = self.fc2(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.llama.modeling_llama.repeat_kv
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+# Copied from transformers.models.gemma.modeling_gemma.GemmaAttention with Gemma->Mimi
+class MimiAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: MimiConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.attention_dropout = config.attention_dropout
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = config.head_dim
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.is_causal = True
+        self.scaling = 1 / math.sqrt(config.head_dim)
+
+        if self.hidden_size % self.num_heads != 0:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
+        self.rotary_emb = MimiRotaryEmbedding(
+            self.head_dim,
+            max_position_embeddings=self.max_position_embeddings,
+            base=self.rope_theta,
+        )
+        self.sliding_window = config.sliding_window  # Ignore copy
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        cos, sin = self.rotary_emb(value_states, position_ids)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_value is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling
+
+        if attention_mask is not None:  # no matter the length, we just slice it
+            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+            attn_weights = attn_weights + causal_mask
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+
+        attn_output = attn_output.view(bsz, q_len, -1)
+        attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
+
+
+# Copied from transformers.models.gemma.modeling_gemma.GemmaFlashAttention2 with Gemma->Mimi
+class MimiFlashAttention2(MimiAttention):
+    """
+    Mimi flash attention module. This module inherits from `MimiAttention` as the weights of the module stays
+    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
+    flash attention and deal with padding tokens in case the input contains any of them.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        if isinstance(past_key_value, StaticCache):
+            raise ValueError(
+                "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
+                "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
+            )
+
+        output_attentions = False
+
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        # Flash attention requires the input to have the shape
+        # batch_size x seq_length x head_dim x hidden_dim
+        # therefore we just need to keep the original shape
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        cos, sin = self.rotary_emb(value_states, position_ids)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_value is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
+        # to be able to avoid many of these transpose/reshape/view.
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        dropout_rate = self.attention_dropout if self.training else 0.0
+
+        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+        # therefore the input hidden states gets silently casted in float32. Hence, we need
+        # cast them back in the correct dtype just to be sure everything works as expected.
+        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
+        # in fp32. (MimiRMSNorm handles it correctly)
+
+        input_dtype = query_states.dtype
+        if input_dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = torch.get_autocast_gpu_dtype()
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+
+        attn_output = _flash_attention_forward(
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            q_len,
+            position_ids=position_ids,
+            dropout=dropout_rate,
+            sliding_window=getattr(self, "sliding_window", None),
+            is_causal=self.is_causal,
+            use_top_left_mask=self._flash_attn_uses_top_left_mask,
+        )
+
+        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
+
+
+# Copied from transformers.models.gemma.modeling_gemma.GemmaSdpaAttention with Gemma->Mimi
+class MimiSdpaAttention(MimiAttention):
+    """
+    Mimi attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+    `MimiAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
+    SDPA API.
+    """
+
+    # Adapted from MimiAttention.forward
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        if output_attentions:
+            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
+            logger.warning_once(
+                "MimiModel is using MimiSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
+            return super().forward(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+            )
+
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        cos, sin = self.rotary_emb(value_states, position_ids)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_value is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        causal_mask = attention_mask
+        if attention_mask is not None:
+            causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
+
+        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+        # Reference: https://github.com/pytorch/pytorch/issues/112577.
+        if query_states.device.type == "cuda" and causal_mask is not None:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+
+        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+        is_causal = True if causal_mask is None and q_len > 1 else False
+
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=causal_mask,
+            dropout_p=self.attention_dropout if self.training else 0.0,
+            is_causal=is_causal,
+        )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(bsz, q_len, -1)
+
+        attn_output = self.o_proj(attn_output)
+
+        return attn_output, None, past_key_value
+
+
+MIMI_ATTENTION_CLASSES = {
+    "eager": MimiAttention,
+    "flash_attention_2": MimiFlashAttention2,
+    "sdpa": MimiSdpaAttention,
+}
+
+
+class MimiTransformerLayer(nn.Module):
+    def __init__(self, config: MimiConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = MIMI_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
+
+        self.mlp = MimiMLP(config)
+        self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps)
+        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps)
+        self.self_attn_layer_scale = MimiLayerScale(config)
+        self.mlp_layer_scale = MimiLayerScale(config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*):
+                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
+                query_sequence_length, key_sequence_length)` if default attention is used.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence
+            kwargs (`dict`, *optional*):
+                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
+                into the model
+        """
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        hidden_states = residual + self.self_attn_layer_scale(hidden_states)
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + self.mlp_layer_scale(hidden_states)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        if use_cache:
+            outputs += (present_key_value,)
+
+        return outputs
+
+
+class MimiTransformerModel(nn.Module):
+    """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MimiTransformerLayer`]
+
+    Args:
+        config: MimiConfig
+    """
+
+    def __init__(self, config: MimiConfig):
+        super().__init__()
+
+        self.layers = nn.ModuleList(
+            [MimiTransformerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self._attn_implementation = config._attn_implementation
+
+        self.gradient_checkpointing = False
+        self.config = config
+
+    def forward(
+        self,
+        hidden_states: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Embedded representation that will be contextualized by the model
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+
+                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+                [`PreTrainedTokenizer.__call__`] for details.
+
+                If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+                `past_key_values`).
+
+                If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+                and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+                information on the default strategy.
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+            position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+                config.n_positions - 1]`.
+
+                [What are position IDs?](../glossary#position-ids)
+            past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
+                Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+                blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
+                returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+                Two formats are allowed:
+                - a [`~cache_utils.Cache`] instance;
+                - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
+                cache format.
+
+                The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
+                legacy cache format will be returned.
+
+                If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
+                have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
+                of shape `(batch_size, sequence_length)`.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+                `past_key_values`).
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+                tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+                more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+
+        if use_cache and past_key_values is None and not self.training:
+            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + hidden_states.shape[1], device=hidden_states.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = None
+        if attention_mask is not None:
+            causal_mask = self._update_causal_mask(
+                attention_mask, hidden_states, cache_position, past_key_values, output_attentions
+            )
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = None
+
+        for decoder_layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    decoder_layer.__call__,
+                    hidden_states,
+                    causal_mask,
+                    position_ids,
+                    past_key_values,
+                    output_attentions,
+                    use_cache,
+                    cache_position,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=causal_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_values,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                    cache_position=cache_position,
+                )
+
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        next_cache = next_decoder_cache if use_cache else None
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+
+    # Copied from transformers.models.gemma.modeling_gemma.GemmaModel._update_causal_mask
+    def _update_causal_mask(
+        self,
+        attention_mask: torch.Tensor,
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        past_key_values: Cache,
+        output_attentions: bool,
+    ):
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and 0.0 in attention_mask:
+                return attention_mask
+            return None
+
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        using_static_cache = isinstance(past_key_values, StaticCache)
+
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                is_training=self.training,
+            ):
+                return None
+
+        dtype, device = input_tensor.dtype, input_tensor.device
+        min_dtype = torch.finfo(dtype).min
+        sequence_length = input_tensor.shape[1]
+        if using_static_cache:
+            target_length = past_key_values.get_max_length()
+        else:
+            target_length = (
+                attention_mask.shape[-1]
+                if isinstance(attention_mask, torch.Tensor)
+                else past_seen_tokens + sequence_length + 1
+            )
+
+        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+        causal_mask = _prepare_4d_causal_attention_mask_with_cache_position(
+            attention_mask,
+            sequence_length=sequence_length,
+            target_length=target_length,
+            dtype=dtype,
+            device=device,
+            min_dtype=min_dtype,
+            cache_position=cache_position,
+            batch_size=input_tensor.shape[0],
+        )
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type == "cuda"
+            and not output_attentions
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+
+class MimiDecoder(nn.Module):
+    """SEANet decoder as used by Mimi."""
+
+    def __init__(self, config: MimiConfig):
+        super().__init__()
+        scaling = int(2 ** len(config.upsampling_ratios))
+        model = [MimiConv1d(config, config.hidden_size, scaling * config.num_filters, config.kernel_size)]
+
+        # Upsample to raw audio scale
+        for ratio in config.upsampling_ratios:
+            current_scale = scaling * config.num_filters
+            # Add upsampling layers
+            model += [nn.ELU()]
+            model += [
+                MimiConvTranspose1d(config, current_scale, current_scale // 2, kernel_size=ratio * 2, stride=ratio)
+            ]
+            # Add residual layers
+            for j in range(config.num_residual_layers):
+                model += [MimiResnetBlock(config, current_scale // 2, (config.dilation_growth_rate**j, 1))]
+            scaling //= 2
+
+        # Add final layers
+        model += [nn.ELU()]
+        model += [MimiConv1d(config, config.num_filters, config.audio_channels, config.last_kernel_size)]
+        self.layers = nn.ModuleList(model)
+
+    # Copied from transformers.models.encodec.modeling_encodec.EncodecDecoder.forward
+    def forward(self, hidden_states):
+        for layer in self.layers:
+            hidden_states = layer(hidden_states)
+        return hidden_states
+
+
+class MimiEuclideanCodebook(nn.Module):
+    """Codebook with Euclidean distance."""
+
+    def __init__(self, config: MimiConfig, epsilon: float = 1e-5):
+        super().__init__()
+        embed = torch.zeros(config.codebook_size, config.codebook_dim)
+
+        self.codebook_size = config.codebook_size
+
+        self.register_buffer("initialized", torch.Tensor([True]))
+        self.register_buffer("cluster_usage", torch.ones(config.codebook_size))
+        self.register_buffer("embed_sum", embed)
+        self._embed = None
+        self.epsilon = epsilon
+
+    @property
+    def embed(self) -> torch.Tensor:
+        if self._embed is None:
+            self._embed = self.embed_sum / self.cluster_usage.clamp(min=self.epsilon)[:, None]
+        return self._embed
+
+    def quantize(self, hidden_states):
+        # Projects each vector in `hidden_states` over the nearest centroid and return its index.
+        # `hidden_states` should be `[N, D]` with `N` the number of input vectors and `D` the dimension.
+        dists = torch.cdist(hidden_states[None], self.embed[None], p=2)[0]
+        embed_ind = dists.argmin(dim=-1)
+        return embed_ind
+
+    # Copied from transformers.models.encodec.modeling_encodec.EncodecEuclideanCodebook.encode
+    def encode(self, hidden_states):
+        shape = hidden_states.shape
+        # pre-process
+        hidden_states = hidden_states.reshape((-1, shape[-1]))
+        # quantize
+        embed_ind = self.quantize(hidden_states)
+        # post-process
+        embed_ind = embed_ind.view(*shape[:-1])
+        return embed_ind
+
+    # Copied from transformers.models.encodec.modeling_encodec.EncodecEuclideanCodebook.decode
+    def decode(self, embed_ind):
+        quantize = nn.functional.embedding(embed_ind, self.embed)
+        return quantize
+
+
+# Copied from transformers.models.encodec.modeling_encodec.EncodecVectorQuantization with Encodec->Mimi
+class MimiVectorQuantization(nn.Module):
+    """
+    Vector quantization implementation. Currently supports only euclidean distance.
+    """
+
+    def __init__(self, config: MimiConfig):
+        super().__init__()
+        self.codebook = MimiEuclideanCodebook(config)
+
+    def encode(self, hidden_states):
+        hidden_states = hidden_states.permute(0, 2, 1)
+        embed_in = self.codebook.encode(hidden_states)
+        return embed_in
+
+    def decode(self, embed_ind):
+        quantize = self.codebook.decode(embed_ind)
+        quantize = quantize.permute(0, 2, 1)
+        return quantize
+
+
+class MimiResidualVectorQuantizer(nn.Module):
+    """Residual Vector Quantizer."""
+
+    def __init__(self, config: MimiConfig, num_quantizers: int = None):
+        super().__init__()
+        self.codebook_size = config.codebook_size
+        self.frame_rate = config.frame_rate
+        self.num_quantizers = num_quantizers if num_quantizers is not None else config.num_quantizers
+        self.layers = nn.ModuleList([MimiVectorQuantization(config) for _ in range(self.num_quantizers)])
+
+        self.input_proj = None
+        self.output_proj = None
+        if config.vector_quantization_hidden_dimension != config.hidden_size:
+            self.input_proj = torch.nn.Conv1d(
+                config.hidden_size, config.vector_quantization_hidden_dimension, 1, bias=False
+            )
+            self.output_proj = torch.nn.Conv1d(
+                config.vector_quantization_hidden_dimension, config.hidden_size, 1, bias=False
+            )
+
+    def encode(self, embeddings: torch.Tensor, num_quantizers: Optional[int] = None) -> torch.Tensor:
+        """
+        Encode a given input tensor with the specified frame rate at the given number of quantizers / codebooks. The RVQ encode method sets
+        the appropriate number of quantizers to use and returns indices for each quantizer.
+        """
+        if self.input_proj is not None:
+            embeddings = self.input_proj(embeddings)
+
+        num_quantizers = num_quantizers if num_quantizers is not None else self.num_quantizers
+
+        residual = embeddings
+        all_indices = []
+        for layer in self.layers[:num_quantizers]:
+            indices = layer.encode(residual)
+            quantized = layer.decode(indices)
+            residual = residual - quantized
+            all_indices.append(indices)
+        out_indices = torch.stack(all_indices)
+        return out_indices
+
+    def decode(self, codes: torch.Tensor) -> torch.Tensor:
+        """Decode the given codes of shape [B, K, T] to the quantized representation."""
+        quantized_out = torch.tensor(0.0, device=codes.device)
+        codes = codes.transpose(0, 1)
+        for i, indices in enumerate(codes):
+            layer = self.layers[i]
+            quantized = layer.decode(indices)
+            quantized_out = quantized_out + quantized
+
+        if self.output_proj is not None:
+            quantized_out = self.output_proj(quantized_out)
+        return quantized_out
+
+
+class MimiSplitResidualVectorQuantizer(nn.Module):
+    """Split Residual Vector Quantizer."""
+
+    def __init__(self, config: MimiConfig):
+        super().__init__()
+        self.codebook_size = config.codebook_size
+        self.frame_rate = config.frame_rate
+        self.max_num_quantizers = config.num_quantizers
+
+        self.num_semantic_quantizers = config.num_semantic_quantizers
+        self.num_acoustic_quantizers = config.num_quantizers - config.num_semantic_quantizers
+
+        self.semantic_residual_vector_quantizer = MimiResidualVectorQuantizer(config, self.num_semantic_quantizers)
+        self.acoustic_residual_vector_quantizer = MimiResidualVectorQuantizer(config, self.num_acoustic_quantizers)
+
+    def encode(self, embeddings: torch.Tensor, num_quantizers: Optional[float] = None) -> torch.Tensor:
+        """
+        Encode a given input tensor with the specified frame rate at the given number of quantizers / codebooks. The RVQ encode method sets
+        the appropriate number of quantizers to use and returns indices for each quantizer.
+        """
+
+        num_quantizers = self.max_num_quantizers if num_quantizers is None else num_quantizers
+
+        if num_quantizers > self.max_num_quantizers:
+            raise ValueError(
+                f"The number of quantizers (i.e codebooks) asked should be lower than the total number of quantizers {self.max_num_quantizers}, but is currently {num_quantizers}."
+            )
+
+        if num_quantizers < self.num_semantic_quantizers:
+            raise ValueError(
+                f"The number of quantizers (i.e codebooks) asked should be higher than the number of semantic quantizers {self.num_semantic_quantizers}, but is currently {num_quantizers}."
+            )
+
+        # codes is [K, B, T], with T frames, K nb of codebooks.
+        codes = self.semantic_residual_vector_quantizer.encode(embeddings)
+
+        if num_quantizers > self.num_semantic_quantizers:
+            acoustic_codes = self.acoustic_residual_vector_quantizer.encode(
+                embeddings, num_quantizers=num_quantizers - self.num_semantic_quantizers
+            )
+            codes = torch.cat([codes, acoustic_codes], dim=0)
+
+        return codes
+
+    def decode(self, codes: torch.Tensor) -> torch.Tensor:
+        """Decode the given codes to the quantized representation."""
+
+        # The first num_semantic_quantizers codebooks are decoded using the semantic RVQ
+        quantized_out = self.semantic_residual_vector_quantizer.decode(codes[:, : self.num_semantic_quantizers])
+
+        # The rest of the codebooks are decoded using the acoustic RVQ
+        if codes.shape[1] > self.num_semantic_quantizers:
+            quantized_out += self.acoustic_residual_vector_quantizer.decode(codes[:, self.num_semantic_quantizers :])
+        return quantized_out
+
+
+class MimiPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = MimiConfig
+    base_model_prefix = "mimi"
+    main_input_name = "input_values"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["MimiDecoderLayer"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    _supports_cache_class = True
+    _supports_static_cache = True
+
+    # Copied from transformers.models.encodec.modeling_encodec.EncodecPreTrainedModel._init_weights
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, nn.Conv1d):
+            nn.init.kaiming_normal_(module.weight)
+            if module.bias is not None:
+                k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
+                nn.init.uniform_(module.bias, a=-k, b=k)
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LSTM):
+            for name, param in module.named_parameters():
+                if "weight" in name:
+                    nn.init.xavier_uniform_(param)
+                elif "bias" in name:
+                    nn.init.constant_(param, 0.0)
+
+
+MIMI_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`MimiConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+MIMI_INPUTS_DOCSTRING = r"""
+    Args:
+        input_values (`torch.FloatTensor` of shape `(batch_size, channels, sequence_length)`, *optional*):
+            Raw audio input converted to Float.
+        padding_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* or 0
+            for *masked*.
+        num_quantizers (`int`, *optional*):
+            Number of quantizers (i.e codebooks) to use. By default, all quantizers are used.
+        audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
+            Discret code embeddings computed using `model.encode`.
+        encoder_past_key_values (`Cache`, *optional*):
+            Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the encoder transformer.
+            This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+            The model will output the same cache format that is fed as input.
+
+            If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes (those that don't
+            have their past key value states given to this model).
+        decoder_past_key_values (`Cache`, *optional*):
+            Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the decoder transformer.
+            This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+            The model will output the same cache format that is fed as input.
+
+            If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes (those that don't
+            have their past key value states given to this model).
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    "The Mimi neural audio codec model.",
+    MIMI_START_DOCSTRING,
+)
+class MimiModel(MimiPreTrainedModel):
+    def __init__(self, config: MimiConfig):
+        super().__init__(config)
+        self.config = config
+
+        self.encoder = MimiEncoder(config)
+        self.encoder_transformer = MimiTransformerModel(config)
+
+        self.downsample = None
+        self.upsample = None
+        if config.frame_rate != config.encodec_frame_rate:
+            self.downsample = MimiConv1d(
+                config,
+                config.hidden_size,
+                config.hidden_size,
+                kernel_size=2 * int(config.encodec_frame_rate / config.frame_rate),
+                stride=2,
+                bias=False,
+                pad_mode="replicate",
+            )
+
+            self.upsample = MimiConvTranspose1d(
+                config,
+                config.hidden_size,
+                config.hidden_size,
+                kernel_size=2 * int(config.encodec_frame_rate / config.frame_rate),
+                stride=2,
+                bias=False,
+                groups=config.upsample_groups,
+            )
+
+        self.decoder_transformer = MimiTransformerModel(config)
+        self.decoder = MimiDecoder(config)
+
+        self.quantizer = MimiSplitResidualVectorQuantizer(config)
+
+        self.bits_per_codebook = int(math.log2(self.config.codebook_size))
+        if 2**self.bits_per_codebook != self.config.codebook_size:
+            raise ValueError("The codebook_size must be a power of 2.")
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_encoder(self):
+        return self.encoder
+
+    def get_decoder(self):
+        return self.decoder
+
+    def _encode_frame(
+        self,
+        input_values: torch.Tensor,
+        num_quantizers: int,
+        padding_mask: int,
+        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Encodes the given input using the underlying VQVAE. The padding mask is required to compute the correct scale.
+        """
+        embeddings = self.encoder(input_values)
+        encoder_outputs = self.encoder_transformer(
+            embeddings.transpose(1, 2), past_key_values=past_key_values, return_dict=return_dict
+        )
+        if return_dict:
+            past_key_values = encoder_outputs.get("past_key_values")
+        elif len(encoder_outputs) > 1:
+            past_key_values = encoder_outputs[1]
+        embeddings = encoder_outputs[0].transpose(1, 2)
+        embeddings = self.downsample(embeddings)
+
+        codes = self.quantizer.encode(embeddings, num_quantizers)
+        codes = codes.transpose(0, 1)
+        return codes, past_key_values
+
+    def encode(
+        self,
+        input_values: torch.Tensor,
+        padding_mask: torch.Tensor = None,
+        num_quantizers: Optional[float] = None,
+        encoder_past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple[torch.Tensor, Optional[torch.Tensor]], MimiEncoderOutput]:
+        """
+        Encodes the input audio waveform into discrete codes.
+
+        Args:
+            input_values (`torch.Tensor` of shape `(batch_size, channels, sequence_length)`):
+                Float values of the input audio waveform.
+            padding_mask (`torch.Tensor` of shape `(batch_size, channels, sequence_length)`):
+                Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* or 0
+                for *masked*.
+            num_quantizers (`int`, *optional*):
+                Number of quantizers (i.e codebooks) to use. By default, all quantizers are used.
+            encoder_past_key_values (`Cache`, *optional*):
+                Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the encoder transformer.
+                This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+                The model will output the same cache format that is fed as input.
+
+                If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes (those that don't
+                have their past key value states given to this model).
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+
+        Returns:
+            `codebook` of shape `[batch_size, num_codebooks, frames]`, the discrete encoded codes for the input audio waveform.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        num_quantizers = self.config.num_quantizers if num_quantizers is None else num_quantizers
+
+        if num_quantizers > self.config.num_quantizers:
+            raise ValueError(
+                f"The number of quantizers (i.e codebooks) asked should be lower than the total number of quantizers {self.config.num_quantizers}, but is currently {num_quantizers}."
+            )
+
+        _, channels, input_length = input_values.shape
+
+        if channels < 1 or channels > 2:
+            raise ValueError(f"Number of audio channels must be 1 or 2, but got {channels}")
+
+        if padding_mask is None:
+            padding_mask = torch.ones_like(input_values).bool()
+
+        encoded_frames, encoder_past_key_values = self._encode_frame(
+            input_values,
+            num_quantizers,
+            padding_mask.bool(),
+            past_key_values=encoder_past_key_values,
+            return_dict=return_dict,
+        )
+
+        if not return_dict:
+            return (
+                encoded_frames,
+                encoder_past_key_values,
+            )
+
+        return MimiEncoderOutput(encoded_frames, encoder_past_key_values)
+
+    def _decode_frame(
+        self,
+        codes: torch.Tensor,
+        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+        return_dict: Optional[bool] = None,
+    ) -> torch.Tensor:
+        embeddings = self.quantizer.decode(codes)
+
+        embeddings = self.upsample(embeddings)
+        decoder_outputs = self.decoder_transformer(
+            embeddings.transpose(1, 2), past_key_values=past_key_values, return_dict=return_dict
+        )
+        if return_dict:
+            past_key_values = decoder_outputs.get("past_key_values")
+        elif len(decoder_outputs) > 1:
+            past_key_values = decoder_outputs[1]
+        embeddings = decoder_outputs[0].transpose(1, 2)
+        outputs = self.decoder(embeddings)
+        return outputs, past_key_values
+
+    def decode(
+        self,
+        audio_codes: torch.Tensor,
+        padding_mask: Optional[torch.Tensor] = None,
+        decoder_past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple[torch.Tensor, torch.Tensor], MimiDecoderOutput]:
+        """
+        Decodes the given frames into an output audio waveform.
+
+        Note that the output might be a bit bigger than the input. In that case, any extra steps at the end can be
+        trimmed.
+
+        Args:
+            audio_codes (`torch.LongTensor`  of shape `(batch_size, num_quantizers, codes_length)`, *optional*):
+                Discret code embeddings computed using `model.encode`.
+            padding_mask (`torch.Tensor` of shape `(batch_size, channels, sequence_length)`):
+                Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* or 0
+                for *masked*.
+            decoder_past_key_values (`Cache`, *optional*):
+                Pre-computed hidden-states (key and values in the self-attention blocks) that can be used to speed up sequential decoding of the decoder transformer.
+                This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+                The model will output the same cache format that is fed as input.
+
+                If `past_key_values` are used, the user can optionally input only the last `audio_values` or `audio_codes (those that don't
+                have their past key value states given to this model).
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+
+        """
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        audio_values, decoder_past_key_values = self._decode_frame(
+            audio_codes, past_key_values=decoder_past_key_values, return_dict=return_dict
+        )
+
+        # truncate based on padding mask
+        if padding_mask is not None and padding_mask.shape[-1] < audio_values.shape[-1]:
+            audio_values = audio_values[..., : padding_mask.shape[-1]]
+
+        if not return_dict:
+            return (
+                audio_values,
+                decoder_past_key_values,
+            )
+        return MimiDecoderOutput(audio_values, decoder_past_key_values)
+
+    @add_start_docstrings_to_model_forward(MIMI_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=MimiOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_values: torch.Tensor,
+        padding_mask: Optional[torch.Tensor] = None,
+        num_quantizers: Optional[int] = None,
+        audio_codes: Optional[torch.Tensor] = None,
+        encoder_past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+        decoder_past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple[torch.Tensor, torch.Tensor], MimiOutput]:
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from datasets import load_dataset
+        >>> from transformers import AutoFeatureExtractor, MimiModel
+
+        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
+        >>> audio_sample = dataset["train"]["audio"][0]["array"]
+
+        >>> model_id = "kyutai/mimi"
+        >>> model = MimiModel.from_pretrained(model_id)
+        >>> feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
+
+        >>> inputs = feature_extractor(raw_audio=audio_sample, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+        >>> audio_codes = outputs.audio_codes
+        >>> audio_values = outputs.audio_values
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        if padding_mask is None:
+            padding_mask = torch.ones_like(input_values).bool()
+
+        if audio_codes is None:
+            encoder_outputs = self.encode(
+                input_values, padding_mask, num_quantizers, encoder_past_key_values, return_dict=return_dict
+            )
+            audio_codes = encoder_outputs[0]
+            if return_dict:
+                encoder_past_key_values = encoder_outputs.get("past_key_values")
+            elif len(encoder_outputs) > 1:
+                encoder_past_key_values = encoder_outputs[1]
+
+        decoder_outputs = self.decode(audio_codes, padding_mask, decoder_past_key_values, return_dict=return_dict)
+        audio_values = decoder_outputs[0]
+        if return_dict:
+            decoder_past_key_values = decoder_outputs.get("past_key_values")
+        elif len(decoder_outputs) > 1:
+            decoder_past_key_values = decoder_outputs[1]
+
+        if not return_dict:
+            return (audio_codes, audio_values, encoder_past_key_values, decoder_past_key_values)
+
+        return MimiOutput(
+            audio_codes=audio_codes,
+            audio_values=audio_values,
+            encoder_past_key_values=encoder_past_key_values,
+            decoder_past_key_values=decoder_past_key_values,
+        )
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index 2db7b38b5803..5f8ae6b5fbff 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -5840,6 +5840,20 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
+class MimiModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class MimiPreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 class MistralForCausalLM(metaclass=DummyObject):
     _backends = ["torch"]
 
diff --git a/tests/models/mimi/__init__.py b/tests/models/mimi/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/models/mimi/test_modeling_mimi.py b/tests/models/mimi/test_modeling_mimi.py
new file mode 100644
index 000000000000..dd0f77421be7
--- /dev/null
+++ b/tests/models/mimi/test_modeling_mimi.py
@@ -0,0 +1,890 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Testing suite for the PyTorch Mimi model."""
+
+import inspect
+import os
+import tempfile
+import unittest
+
+import numpy as np
+from datasets import Audio, load_dataset
+from packaging import version
+from parameterized import parameterized
+from pytest import mark
+
+from transformers import AutoFeatureExtractor, MimiConfig
+from transformers.testing_utils import (
+    is_flaky,
+    is_torch_available,
+    require_flash_attn,
+    require_torch,
+    require_torch_gpu,
+    require_torch_sdpa,
+    slow,
+    torch_device,
+)
+from transformers.utils import (
+    is_torch_bf16_available_on_device,
+    is_torch_fp16_available_on_device,
+)
+
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
+
+
+if is_torch_available():
+    import torch
+
+    from transformers import MimiModel
+
+
+# Copied from transformers.tests.encodec.test_modeling_encodec.prepare_inputs_dict
+def prepare_inputs_dict(
+    config,
+    input_ids=None,
+    input_values=None,
+    decoder_input_ids=None,
+    attention_mask=None,
+    decoder_attention_mask=None,
+    head_mask=None,
+    decoder_head_mask=None,
+    cross_attn_head_mask=None,
+):
+    if input_ids is not None:
+        encoder_dict = {"input_ids": input_ids}
+    else:
+        encoder_dict = {"input_values": input_values}
+
+    decoder_dict = {"decoder_input_ids": decoder_input_ids} if decoder_input_ids is not None else {}
+
+    return {**encoder_dict, **decoder_dict}
+
+
+@require_torch
+class MimiModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=5,
+        num_channels=1,
+        is_training=False,
+        intermediate_size=40,
+        hidden_size=32,
+        num_filters=8,
+        num_residual_layers=1,
+        upsampling_ratios=[8, 4],
+        codebook_size=64,
+        vector_quantization_hidden_dimension=64,
+        codebook_dim=64,
+        upsample_groups=32,
+        num_hidden_layers=2,
+        num_attention_heads=2,
+        num_key_value_heads=2,
+        sliding_window=4,
+        use_cache=False,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.num_channels = num_channels
+        self.is_training = is_training
+        self.intermediate_size = intermediate_size
+        self.hidden_size = hidden_size
+        self.num_filters = num_filters
+        self.num_residual_layers = num_residual_layers
+        self.upsampling_ratios = upsampling_ratios
+        self.codebook_size = codebook_size
+        self.vector_quantization_hidden_dimension = vector_quantization_hidden_dimension
+        self.codebook_dim = codebook_dim
+        self.upsample_groups = upsample_groups
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.sliding_window = sliding_window
+        self.use_cache = use_cache
+
+    def prepare_config_and_inputs(self):
+        input_values = floats_tensor([self.batch_size, self.num_channels, self.intermediate_size], scale=1.0)
+        config = self.get_config()
+        inputs_dict = {"input_values": input_values}
+        return config, inputs_dict
+
+    def prepare_config_and_inputs_for_common(self):
+        config, inputs_dict = self.prepare_config_and_inputs()
+        return config, inputs_dict
+
+    def prepare_config_and_inputs_for_model_class(self, model_class):
+        config, inputs_dict = self.prepare_config_and_inputs()
+        inputs_dict["audio_codes"] = ids_tensor([self.batch_size, 1, self.num_channels], self.codebook_size).type(
+            torch.int32
+        )
+
+        return config, inputs_dict
+
+    def get_config(self):
+        return MimiConfig(
+            audio_channels=self.num_channels,
+            chunk_in_sec=None,
+            hidden_size=self.hidden_size,
+            num_filters=self.num_filters,
+            num_residual_layers=self.num_residual_layers,
+            upsampling_ratios=self.upsampling_ratios,
+            codebook_size=self.codebook_size,
+            vector_quantization_hidden_dimension=self.vector_quantization_hidden_dimension,
+            upsample_groups=self.upsample_groups,
+            num_hidden_layers=self.num_hidden_layers,
+            num_attention_heads=self.num_attention_heads,
+            num_key_value_heads=self.num_key_value_heads,
+            sliding_window=self.sliding_window,
+            codebook_dim=self.codebook_dim,
+            use_cache=self.use_cache,
+        )
+
+    def create_and_check_model_forward(self, config, inputs_dict):
+        model = MimiModel(config=config).to(torch_device).eval()
+
+        input_values = inputs_dict["input_values"]
+        result = model(input_values)
+        self.parent.assertEqual(
+            result.audio_values.shape, (self.batch_size, self.num_channels, self.intermediate_size)
+        )
+
+
+@require_torch
+class MimiModelTest(ModelTesterMixin, unittest.TestCase):
+    all_model_classes = (MimiModel,) if is_torch_available() else ()
+    is_encoder_decoder = True
+    test_pruning = False
+    test_headmasking = False
+    test_resize_embeddings = False
+    test_torchscript = False
+    input_name = "input_values"
+
+    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
+        # model does support returning hidden states
+        inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
+        if "output_attentions" in inputs_dict:
+            inputs_dict.pop("output_attentions")
+        if "output_hidden_states" in inputs_dict:
+            inputs_dict.pop("output_hidden_states")
+        return inputs_dict
+
+    def setUp(self):
+        self.model_tester = MimiModelTester(self)
+        self.config_tester = ConfigTester(
+            self, config_class=MimiConfig, hidden_size=37, common_properties=[], has_text_modality=False
+        )
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_model_forward(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model_forward(*config_and_inputs)
+
+    def test_forward_signature(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            signature = inspect.signature(model.forward)
+            # signature.parameters is an OrderedDict => so arg_names order is deterministic
+            arg_names = [*signature.parameters.keys()]
+
+            expected_arg_names = ["input_values", "padding_mask", "num_quantizers"]
+            self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
+
+    @unittest.skip(reason="The MimiModel does not have `inputs_embeds` logics")
+    def test_inputs_embeds(self):
+        pass
+
+    @unittest.skip(reason="The MimiModel does not have `inputs_embeds` logics")
+    def test_model_get_set_embeddings(self):
+        pass
+
+    @unittest.skip(reason="The MimiModel does not have the usual `attention` logic")
+    def test_retain_grad_hidden_states_attentions(self):
+        pass
+
+    @unittest.skip(reason="The MimiModel does not have the usual `attention` logic")
+    def test_torchscript_output_attentions(self):
+        pass
+
+    @unittest.skip(reason="The MimiModel does not have the usual `hidden_states` logic")
+    def test_torchscript_output_hidden_state(self):
+        pass
+
+    # Copied from transformers.tests.encodec.test_modeling_encodec.MimiModelTest._create_and_check_torchscript
+    def _create_and_check_torchscript(self, config, inputs_dict):
+        if not self.test_torchscript:
+            self.skipTest(reason="test_torchscript is set to False")
+
+        configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
+        configs_no_init.torchscript = True
+        configs_no_init.return_dict = False
+        for model_class in self.all_model_classes:
+            model = model_class(config=configs_no_init)
+            model.to(torch_device)
+            model.eval()
+            inputs = self._prepare_for_class(inputs_dict, model_class)
+
+            main_input_name = model_class.main_input_name
+
+            try:
+                main_input = inputs[main_input_name]
+                model(main_input)
+                traced_model = torch.jit.trace(model, main_input)
+            except RuntimeError:
+                self.fail("Couldn't trace module.")
+
+            with tempfile.TemporaryDirectory() as tmp_dir_name:
+                pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt")
+
+                try:
+                    torch.jit.save(traced_model, pt_file_name)
+                except Exception:
+                    self.fail("Couldn't save module.")
+
+                try:
+                    loaded_model = torch.jit.load(pt_file_name)
+                except Exception:
+                    self.fail("Couldn't load module.")
+
+            model.to(torch_device)
+            model.eval()
+
+            loaded_model.to(torch_device)
+            loaded_model.eval()
+
+            model_state_dict = model.state_dict()
+            loaded_model_state_dict = loaded_model.state_dict()
+
+            non_persistent_buffers = {}
+            for key in loaded_model_state_dict.keys():
+                if key not in model_state_dict.keys():
+                    non_persistent_buffers[key] = loaded_model_state_dict[key]
+
+            loaded_model_state_dict = {
+                key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers
+            }
+
+            self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys()))
+
+            model_buffers = list(model.buffers())
+            for non_persistent_buffer in non_persistent_buffers.values():
+                found_buffer = False
+                for i, model_buffer in enumerate(model_buffers):
+                    if torch.equal(non_persistent_buffer, model_buffer):
+                        found_buffer = True
+                        break
+
+                self.assertTrue(found_buffer)
+                model_buffers.pop(i)
+
+            model_buffers = list(model.buffers())
+            for non_persistent_buffer in non_persistent_buffers.values():
+                found_buffer = False
+                for i, model_buffer in enumerate(model_buffers):
+                    if torch.equal(non_persistent_buffer, model_buffer):
+                        found_buffer = True
+                        break
+
+                self.assertTrue(found_buffer)
+                model_buffers.pop(i)
+
+            models_equal = True
+            for layer_name, p1 in model_state_dict.items():
+                if layer_name in loaded_model_state_dict:
+                    p2 = loaded_model_state_dict[layer_name]
+                    if p1.data.ne(p2.data).sum() > 0:
+                        models_equal = False
+
+            self.assertTrue(models_equal)
+
+            # Avoid memory leak. Without this, each call increase RAM usage by ~20MB.
+            # (Even with this call, there are still memory leak by ~0.04MB)
+            self.clear_torch_jit_class_registry()
+
+    @unittest.skip(reason="The MimiModel does not have the usual `attention` logic")
+    def test_attention_outputs(self):
+        pass
+
+    @unittest.skip(reason="The MimiModel does not have the usual `hidden_states` logic")
+    def test_hidden_states_output(self):
+        pass
+
+    # Copied from transformers.tests.encodec.test_modeling_encodec.MimiModelTest.test_determinism
+    def test_determinism(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        def check_determinism(first, second):
+            # outputs are not tensors but list (since each sequence don't have the same frame_length)
+            out_1 = first.cpu().numpy()
+            out_2 = second.cpu().numpy()
+            out_1 = out_1[~np.isnan(out_1)]
+            out_2 = out_2[~np.isnan(out_2)]
+            max_diff = np.amax(np.abs(out_1 - out_2))
+            self.assertLessEqual(max_diff, 1e-5)
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                first = model(**self._prepare_for_class(inputs_dict, model_class))[0]
+                second = model(**self._prepare_for_class(inputs_dict, model_class))[0]
+
+            if isinstance(first, tuple) and isinstance(second, tuple):
+                for tensor1, tensor2 in zip(first, second):
+                    check_determinism(tensor1, tensor2)
+            else:
+                check_determinism(first, second)
+
+    # Copied from transformers.tests.encodec.test_modeling_encodec.MimiModelTest.test_model_outputs_equivalence
+    def test_model_outputs_equivalence(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        def set_nan_tensor_to_zero(t):
+            t[t != t] = 0
+            return t
+
+        def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}):
+            with torch.no_grad():
+                tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs)
+                dict_output = model(**dict_inputs, return_dict=True, **additional_kwargs)
+
+                self.assertTrue(isinstance(tuple_output, tuple))
+                self.assertTrue(isinstance(dict_output, dict))
+
+                for tuple_value, dict_value in zip(tuple_output, dict_output.values()):
+                    self.assertTrue(
+                        torch.allclose(
+                            set_nan_tensor_to_zero(tuple_value), set_nan_tensor_to_zero(dict_value), atol=1e-5
+                        ),
+                        msg=(
+                            "Tuple and dict output are not equal. Difference:"
+                            f" {torch.max(torch.abs(tuple_value - dict_value))}. Tuple has `nan`:"
+                            f" {torch.isnan(tuple_value).any()} and `inf`: {torch.isinf(tuple_value)}. Dict has"
+                            f" `nan`: {torch.isnan(dict_value).any()} and `inf`: {torch.isinf(dict_value)}."
+                        ),
+                    )
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+
+            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
+            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
+            check_equivalence(model, tuple_inputs, dict_inputs)
+
+    def test_initialization(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        configs_no_init = _config_zero_init(config)
+        for model_class in self.all_model_classes:
+            model = model_class(config=configs_no_init)
+            for name, param in model.named_parameters():
+                uniform_init_parms = ["conv", "input_proj", "output_proj"]
+                if param.requires_grad:
+                    if any(x in name for x in uniform_init_parms):
+                        self.assertTrue(
+                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
+                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
+                        )
+
+    # Copied from transformers.tests.encodec.test_modeling_encodec.MimiModelTest.test_identity_shortcut
+    def test_identity_shortcut(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs()
+        config.use_conv_shortcut = False
+        self.model_tester.create_and_check_model_forward(config, inputs_dict)
+
+    @parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
+    @require_torch_sdpa
+    @slow
+    def test_eager_matches_sdpa_inference(self, torch_dtype: str):
+        if not self.has_attentions:
+            self.skipTest(reason="Model architecture does not support attentions")
+
+        if not self.all_model_classes[0]._supports_sdpa:
+            self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA")
+
+        if torch_dtype == "float16" and not is_torch_fp16_available_on_device(torch_device):
+            self.skipTest(f"float16 not supported on {torch_device} (on the specific device currently used)")
+
+        if torch_dtype == "bfloat16" and not is_torch_bf16_available_on_device(torch_device):
+            self.skipTest(
+                f"bfloat16 not supported on {torch_device} (on the specific device currently used, e.g. Nvidia T4 GPU)"
+            )
+
+        # Not sure whether it's fine to put torch.XXX in a decorator if torch is not available so hacking it here instead.
+        if torch_dtype == "float16":
+            torch_dtype = torch.float16
+        elif torch_dtype == "bfloat16":
+            torch_dtype = torch.bfloat16
+        elif torch_dtype == "float32":
+            torch_dtype = torch.float32
+
+        atols = {
+            ("cpu", False, torch.float32): 1e-6,
+            ("cpu", False, torch.bfloat16): 1e-2,
+            ("cpu", True, torch.float32): 1e-6,
+            ("cpu", True, torch.bfloat16): 1e-2,
+            ("cuda", False, torch.float32): 1e-6,
+            ("cuda", False, torch.bfloat16): 1e-2,
+            ("cuda", False, torch.float16): 5e-3,
+            ("cuda", True, torch.float32): 1e-6,
+            ("cuda", True, torch.bfloat16): 1e-2,
+            ("cuda", True, torch.float16): 5e-3,
+        }
+        rtols = {
+            ("cpu", False, torch.float32): 1e-4,
+            ("cpu", False, torch.bfloat16): 1e-2,
+            ("cpu", True, torch.float32): 1e-4,
+            ("cpu", True, torch.bfloat16): 1e-2,
+            ("cuda", False, torch.float32): 1e-4,
+            ("cuda", False, torch.bfloat16): 1e-2,
+            ("cuda", False, torch.float16): 5e-3,
+            ("cuda", True, torch.float32): 1e-4,
+            ("cuda", True, torch.bfloat16): 3e-2,
+            ("cuda", True, torch.float16): 5e-3,
+        }
+
+        def get_mean_reldiff(failcase, x, ref, atol, rtol):
+            return f"{failcase}: mean relative difference: {((x - ref).abs() / (ref.abs() + 1e-12)).mean():.3e}, torch atol = {atol}, torch rtol = {rtol}"
+
+        for model_class in self.all_model_classes:
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            model = model_class(config)
+            # FIXME: we deactivate boolean mask for models using "use_mask_token" in their constructors.
+            # These models support masking only in the case `use_mask_token=True`. Otherwise they cannot consume an input mask.
+            # This means that the class needs to be instantiated much later, after `use_mask` is set, which means a significant refactor of the code.
+            # However masking there is not done at any layers that matters (i.e self-attention), therefore we can safely deactivate it.
+            deactivate_mask = "use_mask_token" in inspect.signature(model_class).parameters
+
+            is_encoder_decoder = model.config.is_encoder_decoder
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname)
+                model_sdpa = model_class.from_pretrained(tmpdirname, torch_dtype=torch_dtype)
+                model_sdpa = model_sdpa.eval().to(torch_device)
+
+                self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
+
+                model_eager = model_class.from_pretrained(
+                    tmpdirname,
+                    torch_dtype=torch_dtype,
+                    attn_implementation="eager",
+                )
+                model_eager = model_eager.eval().to(torch_device)
+
+                self.assertTrue(model_eager.config._attn_implementation == "eager")
+
+                for name, submodule in model_eager.named_modules():
+                    class_name = submodule.__class__.__name__
+                    if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
+                        raise ValueError("The eager model should not have SDPA attention layers")
+
+                has_sdpa = False
+                for name, submodule in model_sdpa.named_modules():
+                    class_name = submodule.__class__.__name__
+                    if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
+                        has_sdpa = True
+                        break
+                if not has_sdpa and model_sdpa.config.model_type != "falcon":
+                    raise ValueError("The SDPA model should have SDPA attention layers")
+
+                # We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 16 times the model,
+                # but it would be nicer to have an efficient way to use parameterized.expand
+                fail_cases = []
+                for padding_side in ["left", "right"]:
+                    for use_mask in [False, True]:
+                        for output_attentions in [True, False]:
+                            can_output_attn = "output_attentions" in inspect.signature(model_sdpa.forward).parameters
+                            if not (self.has_attentions and can_output_attn) and output_attentions:
+                                continue
+                            for batch_size in [1, 5]:
+                                dummy_input = inputs_dict[model.main_input_name]
+
+                                if dummy_input.dtype in [torch.float32, torch.bfloat16, torch.float16]:
+                                    dummy_input = dummy_input.to(torch_dtype)
+
+                                dummy_input = dummy_input[:batch_size]
+                                if dummy_input.shape[0] != batch_size:
+                                    if dummy_input.dtype in [torch.float32, torch.bfloat16, torch.float16]:
+                                        extension = torch.rand(
+                                            batch_size - dummy_input.shape[0],
+                                            *dummy_input.shape[1:],
+                                            dtype=torch_dtype,
+                                            device=torch_device,
+                                        )
+                                        dummy_input = torch.cat((dummy_input, extension), dim=0).to(torch_device)
+                                    else:
+                                        extension = torch.randint(
+                                            high=5,
+                                            size=(batch_size - dummy_input.shape[0], *dummy_input.shape[1:]),
+                                            dtype=dummy_input.dtype,
+                                            device=torch_device,
+                                        )
+                                        dummy_input = torch.cat((dummy_input, extension), dim=0).to(torch_device)
+
+                                if not use_mask:
+                                    dummy_attention_mask = None
+                                else:
+                                    dummy_attention_mask = inputs_dict.get("attention_mask", None)
+                                    if dummy_attention_mask is None:
+                                        if is_encoder_decoder:
+                                            seqlen = inputs_dict.get("decoder_input_ids", dummy_input).shape[-1]
+                                        else:
+                                            seqlen = dummy_input.shape[-1]
+                                        dummy_attention_mask = (
+                                            torch.ones(batch_size, seqlen).to(torch.int64).to(torch_device)
+                                        )
+
+                                    dummy_attention_mask = dummy_attention_mask[:batch_size]
+                                    if dummy_attention_mask.shape[0] != batch_size:
+                                        extension = torch.ones(
+                                            batch_size - dummy_attention_mask.shape[0],
+                                            *dummy_attention_mask.shape[1:],
+                                            dtype=dummy_attention_mask.dtype,
+                                            device=torch_device,
+                                        )
+                                        dummy_attention_mask = torch.cat((dummy_attention_mask, extension), dim=0)
+                                        dummy_attention_mask = dummy_attention_mask.to(torch_device)
+
+                                    dummy_attention_mask[:] = 1
+                                    if padding_side == "left":
+                                        dummy_attention_mask[-1, :-1] = 1
+                                        dummy_attention_mask[-1, -4:] = 0
+                                    elif padding_side == "right":
+                                        dummy_attention_mask[-1, 1:] = 1
+                                        dummy_attention_mask[-1, :3] = 0
+
+                                for enable_kernels in [False, True]:
+                                    failcase = f"padding_side={padding_side}, use_mask={use_mask}, batch_size={batch_size}, enable_kernels={enable_kernels}"
+                                    if is_encoder_decoder:
+                                        decoder_input_ids = inputs_dict.get("decoder_input_ids", dummy_input)[
+                                            :batch_size
+                                        ]
+                                        if decoder_input_ids.shape[0] != batch_size:
+                                            extension = torch.ones(
+                                                batch_size - decoder_input_ids.shape[0],
+                                                *decoder_input_ids.shape[1:],
+                                                dtype=decoder_input_ids.dtype,
+                                                device=torch_device,
+                                            )
+                                            decoder_input_ids = torch.cat((decoder_input_ids, extension), dim=0)
+                                            decoder_input_ids = decoder_input_ids.to(torch_device)
+
+                                        # TODO: never an `attention_mask` arg here?
+                                        processed_inputs = {
+                                            model.main_input_name: dummy_input,
+                                            "decoder_input_ids": decoder_input_ids,
+                                            "decoder_attention_mask": dummy_attention_mask,
+                                            "output_hidden_states": True,
+                                        }
+                                    else:
+                                        processed_inputs = {
+                                            model.main_input_name: dummy_input,
+                                            "output_hidden_states": True,
+                                        }
+
+                                        # Otherwise fails for e.g. WhisperEncoderModel
+                                        if "attention_mask" in inspect.signature(model_eager.forward).parameters:
+                                            processed_inputs["attention_mask"] = dummy_attention_mask
+
+                                        if (
+                                            self.has_attentions
+                                            and "output_attentions" in inspect.signature(model_sdpa.forward).parameters
+                                        ):
+                                            processed_inputs["output_attentions"] = output_attentions
+                                    if not deactivate_mask and (
+                                        "bool_masked_pos" in inspect.signature(model_eager.forward).parameters
+                                    ):
+                                        dummy_mask = torch.ones((self.model_tester.num_masks,))
+
+                                        # In case of additional token (like class) we define a custom `mask_length`
+                                        if hasattr(self.model_tester, "mask_length"):
+                                            mask_length = self.model_tester.mask_length - dummy_mask.size(0)
+                                        else:
+                                            mask_length = self.model_tester.seq_length - dummy_mask.size(0)
+                                        dummy_mask = torch.cat([dummy_mask, torch.zeros(mask_length)])
+                                        dummy_bool_masked_pos = dummy_mask.expand(batch_size, -1).bool()
+                                        processed_inputs["bool_masked_pos"] = dummy_bool_masked_pos.to(torch_device)
+
+                                    if "noise" in inspect.signature(model_eager.forward).parameters:
+                                        np.random.seed(2)
+                                        num_patches = int(
+                                            (self.model_tester.image_size // self.model_tester.patch_size) ** 2
+                                        )
+                                        noise = np.random.uniform(size=(batch_size, num_patches))
+                                        processed_inputs["noise"] = torch.from_numpy(noise)
+
+                                    # TODO: test gradients as well (& for FA2 as well!)
+                                    with torch.no_grad():
+                                        with torch.backends.cuda.sdp_kernel(
+                                            enable_flash=enable_kernels,
+                                            enable_math=True,
+                                            enable_mem_efficient=enable_kernels,
+                                        ):
+                                            prepared_inputs = self._prepare_for_class(processed_inputs, model_class)
+                                            outputs_eager = model_eager(**prepared_inputs)
+                                            outputs_sdpa = model_sdpa(**prepared_inputs)
+
+                                    # Ignore copy
+                                    logits_eager = outputs_eager.audio_values
+                                    # Ignore copy
+                                    logits_sdpa = outputs_sdpa.audio_values
+
+                                    if torch_device in ["cpu", "cuda"]:
+                                        atol = atols[torch_device, enable_kernels, torch_dtype]
+                                        rtol = rtols[torch_device, enable_kernels, torch_dtype]
+                                    else:
+                                        atol = 1e-7
+                                        rtol = 1e-4
+
+                                    # Masked tokens output slightly deviates - we don't mind that.
+                                    if use_mask:
+                                        if padding_side == "left":
+                                            sub_sdpa = logits_sdpa[:-1]
+                                            sub_eager = logits_eager[:-1]
+                                            if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
+                                                fail_cases.append(
+                                                    get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol)
+                                                )
+
+                                            sub_sdpa = logits_sdpa[-1, :-4]
+                                            sub_eager = logits_eager[-1, :-4]
+                                            if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
+                                                fail_cases.append(
+                                                    get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol)
+                                                )
+
+                                            # Testing the padding tokens is not really meaningful but anyway
+                                            # sub_sdpa = logits_sdpa[-1, -4:]
+                                            # sub_eager = logits_eager[-1, -4:]
+                                            # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
+                                            #     fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2))
+                                        elif padding_side == "right":
+                                            sub_sdpa = logits_sdpa[:-1]
+                                            sub_eager = logits_eager[:-1]
+                                            if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
+                                                fail_cases.append(
+                                                    get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol)
+                                                )
+
+                                            sub_sdpa = logits_sdpa[-1, 3:]
+                                            sub_eager = logits_eager[-1, 3:]
+                                            if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
+                                                fail_cases.append(
+                                                    get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol)
+                                                )
+
+                                            # Testing the padding tokens is not really meaningful but anyway
+                                            # sub_sdpa = logits_sdpa[-1, :3]
+                                            # sub_eager = logits_eager[-1, :3]
+                                            # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
+                                            #     fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2))
+
+                                    else:
+                                        if not torch.allclose(logits_sdpa, logits_eager, atol=atol, rtol=rtol):
+                                            fail_cases.append(
+                                                get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol)
+                                            )
+
+                self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases))
+
+    @require_flash_attn
+    @require_torch_gpu
+    @mark.flash_attn_test
+    @slow
+    @is_flaky()
+    def test_flash_attn_2_inference_equivalence(self):
+        for model_class in self.all_model_classes:
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            model = model_class(config)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname)
+                model_fa = model_class.from_pretrained(
+                    tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
+                )
+                model_fa.to(torch_device)
+
+                model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16)
+                model.to(torch_device)
+
+                dummy_input = inputs_dict[model.main_input_name][:1]
+                if dummy_input.dtype in [torch.float32, torch.float16]:
+                    dummy_input = dummy_input.to(torch.bfloat16)
+
+                outputs = model(dummy_input)
+                outputs_fa = model_fa(dummy_input)
+
+                logits = outputs[1]
+                logits_fa = outputs_fa[1]
+
+                assert torch.allclose(logits_fa, logits, atol=4e-2, rtol=4e-2)
+
+    @unittest.skip(reason="The MimiModel does not support right padding")
+    def test_flash_attn_2_inference_equivalence_right_padding(self):
+        pass
+
+    @unittest.skip(reason="The MimiModel does not have support dynamic compile yet")
+    def test_sdpa_can_compile_dynamic(self):
+        pass
+
+    # For now, Let's focus only on GPU for `torch.compile`
+    @slow
+    @require_torch_gpu
+    def test_torch_compile(self):
+        if version.parse(torch.__version__) < version.parse("2.3"):
+            self.skipTest(reason="This test requires torch >= 2.3 to run.")
+
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        n_iter = 3
+        for model_class in self.all_model_classes:
+            model = model_class(config).to(torch_device)
+            model.forward = torch.compile(model.forward)
+            for i in range(n_iter):
+                _ = model(inputs_dict["input_values"].to(torch_device))
+
+    @is_flaky()
+    def test_batching_equivalence(self):
+        super().test_batching_equivalence()
+
+
+# Copied from transformers.tests.encodec.test_modeling_encodec.normalize
+def normalize(arr):
+    norm = np.linalg.norm(arr)
+    normalized_arr = arr / norm
+    return normalized_arr
+
+
+# Copied from transformers.tests.encodec.test_modeling_encodec.compute_rmse
+def compute_rmse(arr1, arr2):
+    arr1_normalized = normalize(arr1)
+    arr2_normalized = normalize(arr2)
+    return np.sqrt(((arr1_normalized - arr2_normalized) ** 2).mean())
+
+
+@slow
+@require_torch
+class MimiIntegrationTest(unittest.TestCase):
+    def test_integration_using_cache_decode(self):
+        expected_rmse = {
+            "8": 0.0018785292,
+            "32": 0.0012330565,
+        }
+
+        librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        model_id = "kyutai/mimi"
+
+        model = MimiModel.from_pretrained(model_id, use_cache=True).to(torch_device)
+        processor = AutoFeatureExtractor.from_pretrained(model_id)
+
+        librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=processor.sampling_rate))
+        audio_sample = librispeech_dummy[-1]["audio"]["array"]
+
+        inputs = processor(
+            raw_audio=audio_sample,
+            sampling_rate=processor.sampling_rate,
+            return_tensors="pt",
+        ).to(torch_device)
+
+        for num_codebooks, expected_rmse in expected_rmse.items():
+            with torch.no_grad():
+                # use max bandwith for best possible reconstruction
+                encoder_outputs = model.encode(inputs["input_values"], num_quantizers=int(num_codebooks))
+
+                audio_codes = encoder_outputs[0]
+
+                decoder_outputs_first_part = model.decode(audio_codes[:, :, : audio_codes.shape[2] // 2])
+                decoder_outputs_second_part = model.decode(
+                    audio_codes[:, :, audio_codes.shape[2] // 2 :],
+                    decoder_past_key_values=decoder_outputs_first_part.decoder_past_key_values,
+                )
+
+                audio_output_entire_context = model.decode(audio_codes)[0]
+                audio_output_concat_context = torch.cat(
+                    [decoder_outputs_first_part[0], decoder_outputs_second_part[0]], dim=2
+                )
+
+            # make sure audios are more or less equal
+            # the RMSE of two random gaussian noise vectors with ~N(0, 1) is around 1.0
+            rmse = compute_rmse(
+                audio_output_concat_context.squeeze().cpu().numpy(),
+                audio_output_entire_context.squeeze().cpu().numpy(),
+            )
+            self.assertTrue(rmse < 1e-3)
+
+    def test_integration(self):
+        expected_rmses = {
+            "8": 0.0018785292,
+            "32": 0.0012330565,
+        }
+        expected_codesums = {
+            "8": 430423,
+            "32": 1803071,
+        }
+        librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        model_id = "kyutai/mimi"
+
+        processor = AutoFeatureExtractor.from_pretrained(model_id)
+
+        librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=processor.sampling_rate))
+        audio_sample = librispeech_dummy[-1]["audio"]["array"]
+
+        inputs = processor(
+            raw_audio=audio_sample,
+            sampling_rate=processor.sampling_rate,
+            return_tensors="pt",
+        ).to(torch_device)
+
+        for use_cache in [False, True]:
+            model = MimiModel.from_pretrained(model_id, use_cache=use_cache).to(torch_device)
+            for num_codebooks, expected_rmse in expected_rmses.items():
+                with torch.no_grad():
+                    # use max bandwith for best possible reconstruction
+                    encoder_outputs = model.encode(inputs["input_values"], num_quantizers=int(num_codebooks))
+
+                    audio_code_sums = encoder_outputs[0].sum().cpu().item()
+
+                    # make sure audio encoded codes are correct
+                    # assert relative difference less than a threshold, because `audio_code_sums` varies a bit
+                    # depending on torch version
+                    self.assertTrue(
+                        np.abs(audio_code_sums - expected_codesums[num_codebooks]) <= (3e-3 * audio_code_sums)
+                    )
+
+                    input_values_dec = model.decode(encoder_outputs[0], padding_mask=inputs["padding_mask"])[0]
+                    input_values_enc_dec = model(
+                        inputs["input_values"], inputs["padding_mask"], num_quantizers=int(num_codebooks)
+                    )[1]
+
+                # make sure forward and decode gives same result
+                self.assertTrue(torch.allclose(input_values_dec, input_values_enc_dec))
+
+                # make sure shape matches
+                self.assertTrue(inputs["input_values"].shape == input_values_enc_dec.shape)
+
+                arr = inputs["input_values"][0].cpu().numpy()
+                arr_enc_dec = input_values_enc_dec[0].cpu().numpy()
+
+                # make sure audios are more or less equal
+                # the RMSE of two random gaussian noise vectors with ~N(0, 1) is around 1.0
+                rmse = compute_rmse(arr, arr_enc_dec)
+                self.assertTrue(np.abs(rmse - expected_rmse) < 1e-5)

From e40bb4845e0eefb52ec1e9cac9c2446ab36aef81 Mon Sep 17 00:00:00 2001
From: Raushan Turganbay <raushan@huggingface.co>
Date: Thu, 19 Sep 2024 09:56:52 +0200
Subject: [PATCH 46/67] Load and save video-processor from separate folder
 (#33562)

* load and save from video-processor folder

* Update src/transformers/models/llava_onevision/processing_llava_onevision.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

---------

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
---
 .../image_processing_llava_onevision.py       |  1 +
 .../processing_llava_onevision.py             | 53 ++++++++++++++++++-
 .../test_processing_llava_onevision.py        | 21 ++++----
 tests/test_processing_common.py               |  8 +++
 4 files changed, 71 insertions(+), 12 deletions(-)

diff --git a/src/transformers/models/llava_onevision/image_processing_llava_onevision.py b/src/transformers/models/llava_onevision/image_processing_llava_onevision.py
index 3dddcdd148a4..204755720837 100644
--- a/src/transformers/models/llava_onevision/image_processing_llava_onevision.py
+++ b/src/transformers/models/llava_onevision/image_processing_llava_onevision.py
@@ -621,6 +621,7 @@ def preprocess(
         """
         do_resize = do_resize if do_resize is not None else self.do_resize
         size = size if size is not None else self.size
+        size = get_size_dict(size, default_to_square=False)
         image_grid_pinpoints = image_grid_pinpoints if image_grid_pinpoints is not None else self.image_grid_pinpoints
         resample = resample if resample is not None else self.resample
         do_rescale = do_rescale if do_rescale is not None else self.do_rescale
diff --git a/src/transformers/models/llava_onevision/processing_llava_onevision.py b/src/transformers/models/llava_onevision/processing_llava_onevision.py
index e050ec3f31de..d4ae02e0bb15 100644
--- a/src/transformers/models/llava_onevision/processing_llava_onevision.py
+++ b/src/transformers/models/llava_onevision/processing_llava_onevision.py
@@ -17,6 +17,7 @@
 """
 
 import math
+import os
 import sys
 from typing import Iterable, List, Union
 
@@ -34,6 +35,11 @@
     ProcessorMixin,
 )
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import logging
+from ..auto import AutoImageProcessor
+
+
+logger = logging.get_logger(__name__)
 
 
 class LlavaOnevisionProcessorKwargs(ProcessingKwargs, total=False):
@@ -96,7 +102,7 @@ def __init__(
         chat_template=None,
         image_token="<image>",
         video_token="<video>",
-        **kwargs: Unpack[LlavaOnevisionProcessorKwargs],
+        **kwargs,
     ):
         self.num_image_tokens = num_image_tokens
         self.vision_feature_select_strategy = vision_feature_select_strategy
@@ -109,7 +115,7 @@ def __call__(
         images: ImageInput = None,
         text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
         videos: VideoInput = None,
-        **kwargs,
+        **kwargs: Unpack[LlavaOnevisionProcessorKwargs],
     ) -> BatchFeature:
         """
         Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
@@ -272,3 +278,46 @@ def model_input_names(self):
         tokenizer_input_names = self.tokenizer.model_input_names
         image_processor_input_names = self.image_processor.model_input_names
         return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+
+    # override to save video-config in a separate config file
+    def save_pretrained(self, save_directory, **kwargs):
+        if os.path.isfile(save_directory):
+            raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file")
+        os.makedirs(save_directory, exist_ok=True)
+        video_processor_path = os.path.join(save_directory, "video_processor")
+        self.video_processor.save_pretrained(video_processor_path)
+
+        video_processor_present = "video_processor" in self.attributes
+        if video_processor_present:
+            self.attributes.remove("video_processor")
+
+        outputs = super().save_pretrained(save_directory, **kwargs)
+
+        if video_processor_present:
+            self.attributes += ["video_processor"]
+        return outputs
+
+    # override to load video-config from a separate config file
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        processor = super().from_pretrained(pretrained_model_name_or_path, **kwargs)
+
+        # if return_unused_kwargs a tuple is returned where the second element is 'unused_kwargs'
+        if isinstance(processor, tuple):
+            processor = processor[0]
+
+        try:
+            video_processor = AutoImageProcessor.from_pretrained(
+                pretrained_model_name_or_path, subfolder="video_processor"
+            )
+            processor.video_processor = video_processor
+        except EnvironmentError:
+            # this means users are using prev version of saved processor where we had only one preprocessor_config.json
+            # for loading back that should work and load a LlavaOnevisionVideoProcessor class
+            logger.info(
+                "You are loading `LlavaOnevisionProcessor` but the indicated `path` doesn't contain a folder called "
+                "`video_processor`. It is strongly recommended to load and save the processor again so the video processor is saved "
+                "in a separate config."
+            )
+
+        return processor
diff --git a/tests/models/llava_onevision/test_processing_llava_onevision.py b/tests/models/llava_onevision/test_processing_llava_onevision.py
index f747c18250b6..1f998ca4bc04 100644
--- a/tests/models/llava_onevision/test_processing_llava_onevision.py
+++ b/tests/models/llava_onevision/test_processing_llava_onevision.py
@@ -58,15 +58,16 @@ def get_video_processor(self, **kwargs):
         return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).video_processor
 
     def prepare_processor_dict(self):
-        return {"chat_template": "dummy_template"}
+        return {"chat_template": "dummy_template", "num_image_tokens": 6, "vision_feature_select_strategy": "default"}
 
-    @unittest.skip(
-        "Skip because the model has no processor kwargs except for chat template and"
-        "chat template is saved as a separate file. Stop skipping this test when the processor"
-        "has new kwargs saved in config file."
-    )
     def test_processor_to_json_string(self):
-        pass
+        processor = self.get_processor()
+        obj = json.loads(processor.to_json_string())
+        for key, value in self.prepare_processor_dict().items():
+            # chat_tempalate are tested as a separate test because they are saved in separate files
+            if key != "chat_template":
+                self.assertEqual(obj[key], value)
+                self.assertEqual(getattr(processor, key, None), value)
 
     # Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_chat_template_is_saved
     def test_chat_template_is_saved(self):
@@ -191,7 +192,7 @@ def test_unstructured_kwargs_batched(self):
             max_length=76,
         )
         self.assertEqual(inputs["pixel_values"].shape[3], 214)
-        self.assertEqual(len(inputs["input_ids"][0]), 5)
+        self.assertEqual(len(inputs["input_ids"][0]), 4)
 
     @require_torch
     @require_vision
@@ -282,7 +283,7 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self):
         image_input = self.prepare_image_inputs()
 
         inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=112)
-        self.assertEqual(len(inputs["input_ids"][0]), 112)
+        self.assertEqual(len(inputs["input_ids"][0]), 2)
 
     @require_vision
     @require_torch
@@ -299,4 +300,4 @@ def test_tokenizer_defaults_preserved_by_kwargs(self):
         image_input = self.prepare_image_inputs()
 
         inputs = processor(text=input_str, images=image_input, return_tensors="pt")
-        self.assertEqual(len(inputs["input_ids"][0]), 117)
+        self.assertEqual(len(inputs["input_ids"][0]), 2)
diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py
index f3111f5d5785..306f3100fb8d 100644
--- a/tests/test_processing_common.py
+++ b/tests/test_processing_common.py
@@ -112,6 +112,14 @@ def test_processor_from_and_save_pretrained(self):
 
                 self.assertEqual(processor_second.to_dict(), processor_first.to_dict())
 
+                for attribute in processor_first.attributes:
+                    attribute_first = getattr(processor_first, attribute)
+                    attribute_second = getattr(processor_second, attribute)
+
+                    # tokenizer repr contains model-path from where we loaded
+                    if "tokenizer" not in attribute:
+                        self.assertEqual(repr(attribute_first), repr(attribute_second))
+
     # These kwargs-related tests ensure that processors are correctly instantiated.
     # they need to be applied only if an image_processor exists.
 

From d7975a58749cb5f2102a518858eef43ee3614def Mon Sep 17 00:00:00 2001
From: Raushan Turganbay <raushan@huggingface.co>
Date: Thu, 19 Sep 2024 12:04:24 +0200
Subject: [PATCH 47/67] VLMs: enable generation tests (#33533)

* add tests

* fix whisper

* update

* nit

* add qwen2-vl

* more updates!

* better this way

* fix this one

* fix more tests

* fix final tests, hope so

* fix led

* Update tests/generation/test_utils.py

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* pr comments

* not pass pixels and extra for low-mem tests, very flaky because of visio tower

---------

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
---
 src/transformers/generation/utils.py          |   4 +-
 .../models/paligemma/modeling_paligemma.py    | 126 ++++++++-
 .../models/qwen2_vl/modeling_qwen2_vl.py      |   4 +-
 tests/generation/test_utils.py                | 260 ++++++++++++------
 .../test_modeling_bigbird_pegasus.py          |   7 +-
 tests/models/bloom/test_modeling_bloom.py     |   4 -
 tests/models/git/test_modeling_git.py         |  55 ++++
 tests/models/led/test_modeling_led.py         |   8 +
 tests/models/llava/test_modeling_llava.py     |  15 +-
 .../llava_next/test_modeling_llava_next.py    |  22 +-
 .../test_modeling_llava_next_video.py         |  28 +-
 .../test_modeling_llava_onevision.py          |  27 +-
 .../models/musicgen/test_modeling_musicgen.py |  12 +-
 .../test_modeling_musicgen_melody.py          |  12 +-
 .../paligemma/test_modeling_paligemma.py      |  15 +-
 .../models/qwen2_vl/test_modeling_qwen2_vl.py |  27 +-
 .../models/reformer/test_modeling_reformer.py |   7 +-
 .../test_modeling_speech_to_text.py           |   4 +-
 .../video_llava/test_modeling_video_llava.py  |  41 ++-
 .../models/vipllava/test_modeling_vipllava.py |  16 +-
 tests/models/whisper/test_modeling_whisper.py |  13 -
 tests/test_modeling_common.py                 |   2 +-
 22 files changed, 501 insertions(+), 208 deletions(-)

diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py
index d8896f91267d..a4cb000cf462 100644
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@@ -1154,7 +1154,7 @@ def _validate_assistant(self, assistant_model):
                     "Ensure you load the assistant with the correct encoder-decoder class, e.g. `AutoModelForSpeechSeq2Seq` for Whisper."
                 )
 
-        if not self.config.vocab_size == assistant_model.config.vocab_size:
+        if not self.config.get_text_config().vocab_size == assistant_model.config.get_text_config().vocab_size:
             raise ValueError("Make sure the main and assistant model use the same tokenizer")
 
     def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]):
@@ -1476,7 +1476,7 @@ def get_layer_device_map(execution_device_map: Optional[dict] = None):
             layer_device_map = get_layer_device_map(execution_device_map)
 
             cache_kwargs = {
-                "config": self.config if hasattr(self.config, "text_config") else self.config,
+                "config": self.config.get_text_config(),
                 "max_batch_size": batch_size,
                 "max_cache_len": max_cache_len,
                 "device": device,
diff --git a/src/transformers/models/paligemma/modeling_paligemma.py b/src/transformers/models/paligemma/modeling_paligemma.py
index 39ee57d70bb5..4e456b9f08e2 100644
--- a/src/transformers/models/paligemma/modeling_paligemma.py
+++ b/src/transformers/models/paligemma/modeling_paligemma.py
@@ -45,6 +45,74 @@
 _CONFIG_FOR_DOC = "PaliGemmaConfig"
 
 
+# Adapted from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position
+# But Paligemma has no causal mask on prefix
+def _prepare_4d_causal_attention_mask_with_cache_position(
+    attention_mask: torch.Tensor,
+    sequence_length: int,
+    target_length: int,
+    dtype: torch.dtype,
+    device: torch.device,
+    min_dtype: float,
+    cache_position: torch.Tensor,
+    batch_size: int,
+    is_training: bool,
+    token_type_ids: torch.Tensor,
+):
+    """
+    Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+    `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+    Args:
+        attention_mask (`torch.Tensor`):
+            A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
+        sequence_length (`int`):
+            The sequence length being processed.
+        target_length (`int`):
+            The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
+        dtype (`torch.dtype`):
+            The dtype to use for the 4D attention mask.
+        device (`torch.device`):
+            The device to plcae the 4D attention mask on.
+        min_dtype (`float`):
+            The minimum value representable with the dtype `dtype`.
+        cache_position (`torch.Tensor`):
+            Indices depicting the position of the input sequence tokens in the sequence.
+        batch_size (`torch.Tensor`):
+            Batch size.
+        is_training (`bool`):
+            Whether the model is in training mode or in inference. The condition is checked by presence/absence of `token_type_ids/labels`
+    """
+    if attention_mask is not None and attention_mask.dim() == 4:
+        # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+        causal_mask = attention_mask
+    else:
+        causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
+        # Causal diagonal mask only if training, otherwise attend to the whole prefix. Training-specific attn for prefix is handled below
+        if sequence_length != 1:
+            if is_training:
+                causal_mask = torch.triu(causal_mask, diagonal=1)
+            else:
+                causal_mask = torch.zeros_like(causal_mask)
+
+        causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
+        causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+        if attention_mask is not None:
+            causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+            mask_length = attention_mask.shape[-1]
+            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(causal_mask.device)
+            padding_mask = padding_mask == 0
+            causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                padding_mask, min_dtype
+            )
+            # we are training thus we need to create a full mask on the image + prefix but causal on suffix
+            if is_training:
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    token_type_ids[:, None, None, :].to(causal_mask.device) == 0, 0
+                )
+    return causal_mask
+
+
 @dataclass
 class PaliGemmaCausalLMOutputWithPast(ModelOutput):
     """
@@ -285,7 +353,7 @@ def _update_causal_mask(
         self, attention_mask, token_type_ids, inputs_embeds, past_key_values, cache_position, is_training: bool = False
     ):
         using_static_cache = isinstance(past_key_values, StaticCache)
-        dtype, device = inputs_embeds.dtype, inputs_embeds.device
+        dtype = inputs_embeds.dtype
         min_dtype = torch.finfo(dtype).min
         sequence_length = inputs_embeds.shape[1]
         if using_static_cache:
@@ -299,19 +367,19 @@ def _update_causal_mask(
 
         if attention_mask is not None and attention_mask.dim() == 4:
             # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
-            causal_mask = attention_mask
-        else:
-            causal_mask = torch.full(
-                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
-            )
-            # Causal diagonal mask only if training, otherwise attend to the whole prefix. Training-specific attn for prefix is handled below
-            if sequence_length != 1:
-                if is_training:
-                    causal_mask = torch.triu(causal_mask, diagonal=1)
-                else:
-                    causal_mask = torch.zeros_like(causal_mask)
-
-        causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+            return attention_mask
+
+        causal_mask = torch.full(
+            (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
+        )
+        # Causal diagonal mask only if training, otherwise attend to the whole prefix. Training-specific attn for prefix is handled below
+        if sequence_length != 1:
+            if is_training:
+                causal_mask = torch.triu(causal_mask, diagonal=1)
+            else:
+                causal_mask = torch.zeros_like(causal_mask)
+
+        causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
         causal_mask = causal_mask[None, None, :, :].expand(inputs_embeds.shape[0], 1, -1, -1)
         if attention_mask is not None:
             causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
@@ -420,7 +488,8 @@ def forward(
             image_features = self.multi_modal_projector(selected_image_feature)
             image_features = image_features / (self.config.hidden_size**0.5)
 
-            special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
+            special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1)
+            special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
             if inputs_embeds[special_image_mask].numel() != image_features.numel():
                 image_tokens_in_text = torch.sum(input_ids == self.config.image_token_index)
                 raise ValueError(
@@ -508,11 +577,38 @@ def prepare_inputs_for_generation(
             past_key_values=past_key_values,
             inputs_embeds=inputs_embeds,
             attention_mask=attention_mask,
+            position_ids=position_ids,
             cache_position=cache_position,
+            use_cache=use_cache,
             num_logits_to_keep=num_logits_to_keep,
             **kwargs,
         )
 
+        if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
+            if model_inputs["inputs_embeds"] is not None:
+                batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
+                device = model_inputs["inputs_embeds"].device
+            else:
+                batch_size, sequence_length = model_inputs["input_ids"].shape
+                device = model_inputs["input_ids"].device
+
+            dtype = self.get_output_embeddings().weight.dtype
+            min_dtype = torch.finfo(dtype).min
+            is_training = token_type_ids is not None and kwargs.get("labels", None) is not None
+
+            model_inputs["attention_mask"] = _prepare_4d_causal_attention_mask_with_cache_position(
+                attention_mask,
+                sequence_length=sequence_length,
+                target_length=past_key_values.get_max_length(),
+                dtype=dtype,
+                device=device,
+                min_dtype=min_dtype,
+                cache_position=cache_position,
+                batch_size=batch_size,
+                is_training=is_training,
+                token_type_ids=token_type_ids,
+            )
+
         model_inputs["token_type_ids"] = token_type_ids
 
         # position_ids in Paligemma are 1-indexed
diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
index 4e4e04198c0b..8e46f6840ad1 100644
--- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
+++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
@@ -1070,7 +1070,9 @@ def __init__(self, config) -> None:
         self.blocks = nn.ModuleList(
             [Qwen2VLVisionBlock(config, config._attn_implementation) for _ in range(config.depth)]
         )
-        self.merger = PatchMerger(dim=config.hidden_size, context_dim=config.embed_dim)
+        self.merger = PatchMerger(
+            dim=config.hidden_size, context_dim=config.embed_dim, spatial_merge_size=config.spatial_merge_size
+        )
 
     def get_dtype(self) -> torch.dtype:
         return self.blocks[0].mlp.fc2.weight.dtype
diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py
index 0ed054ad5869..413e920609a8 100644
--- a/tests/generation/test_utils.py
+++ b/tests/generation/test_utils.py
@@ -98,10 +98,22 @@ class GenerationTesterMixin:
 
     def _get_input_ids_and_config(self, batch_size=2):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        input_ids = inputs_dict[self.input_name]
+        # TODO: @raushan or @gante, use `model.main_input_name` as the main input instead of relyinn on `input_ids`
+        input_ids = inputs_dict.pop(self.input_name)[:batch_size, :]
+        inputs_dict.pop("attention_mask", None)
 
-        input_ids = input_ids[:batch_size]
+        # we don't want encoder-decoder models to start from filled decoder ids
+        inputs_dict.pop("decoder_input_ids", None)
+        inputs_dict.pop("decoder_attention_mask", None)
 
+        # we'll set cache use in each test differently
+        inputs_dict.pop("use_cache", None)
+
+        inputs_dict = {
+            k: v[:batch_size, ...]
+            for k, v in inputs_dict.items()
+            if "head_mask" not in k and isinstance(v, torch.Tensor)
+        }
         if config.eos_token_id is not None and config.pad_token_id is None:
             # hack to allow generate for models such as GPT2 as is done in `generate()`
             if isinstance(config.eos_token_id, int):
@@ -118,7 +130,7 @@ def _get_input_ids_and_config(self, batch_size=2):
         config.eos_token_id = None
         config.forced_eos_token_id = None
 
-        return config, input_ids, attention_mask
+        return config, input_ids, attention_mask, inputs_dict
 
     def _get_logits_processor_kwargs(self, do_sample=False):
         logits_processor_kwargs = {
@@ -191,6 +203,7 @@ def _greedy_generate(
         model,
         input_ids,
         attention_mask,
+        inputs_dict,
         output_scores=False,
         output_logits=False,
         output_attentions=False,
@@ -213,6 +226,7 @@ def _greedy_generate(
             use_cache=use_cache,
             **logits_processor_kwargs,
             **model_kwargs,
+            **inputs_dict,
         )
 
         return output_generate
@@ -222,6 +236,7 @@ def _sample_generate(
         model,
         input_ids,
         attention_mask,
+        inputs_dict,
         num_return_sequences,
         output_scores=False,
         output_logits=False,
@@ -247,6 +262,7 @@ def _sample_generate(
             use_cache=use_cache,
             **logits_processor_kwargs,
             **model_kwargs,
+            **inputs_dict,
         )
 
         return output_generate
@@ -256,6 +272,7 @@ def _beam_search_generate(
         model,
         input_ids,
         attention_mask,
+        inputs_dict,
         beam_kwargs,
         output_scores=False,
         output_logits=False,
@@ -279,6 +296,7 @@ def _beam_search_generate(
             **beam_kwargs,
             **logits_processor_kwargs,
             **model_kwargs,
+            **inputs_dict,
         )
 
         return output_generate
@@ -288,6 +306,7 @@ def _beam_sample_generate(
         model,
         input_ids,
         attention_mask,
+        inputs_dict,
         beam_kwargs,
         output_scores=False,
         output_logits=False,
@@ -312,6 +331,7 @@ def _beam_sample_generate(
             **beam_kwargs,
             **logits_processor_kwargs,
             **model_kwargs,
+            **inputs_dict,
         )
 
         return output_generate
@@ -321,6 +341,7 @@ def _group_beam_search_generate(
         model,
         input_ids,
         attention_mask,
+        inputs_dict,
         beam_kwargs,
         output_scores=False,
         output_logits=False,
@@ -344,6 +365,7 @@ def _group_beam_search_generate(
             **beam_kwargs,
             **logits_processor_kwargs,
             **model_kwargs,
+            **inputs_dict,
         )
 
         return output_generate
@@ -353,6 +375,7 @@ def _constrained_beam_search_generate(
         model,
         input_ids,
         attention_mask,
+        inputs_dict,
         constraints,
         beam_kwargs,
         output_scores=False,
@@ -378,6 +401,7 @@ def _constrained_beam_search_generate(
             **beam_kwargs,
             **logits_processor_kwargs,
             **model_kwargs,
+            **inputs_dict,
         )
 
         return output_generate
@@ -387,6 +411,7 @@ def _contrastive_generate(
         model,
         input_ids,
         attention_mask,
+        inputs_dict,
         output_scores=False,
         output_logits=False,
         output_attentions=False,
@@ -415,6 +440,7 @@ def _contrastive_generate(
             **logits_processor_kwargs,
             **model_kwargs,
             **contrastive_search_kwargs,
+            **inputs_dict,
         )
 
         return output_generate
@@ -422,10 +448,12 @@ def _contrastive_generate(
     @pytest.mark.generate
     def test_greedy_generate(self):
         for model_class in self.all_generative_model_classes:
-            config, input_ids, attention_mask = self._get_input_ids_and_config()
+            config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config()
 
             model = model_class(config).to(torch_device).eval()
-            output_generate = self._greedy_generate(model=model, input_ids=input_ids, attention_mask=attention_mask)
+            output_generate = self._greedy_generate(
+                model=model, input_ids=input_ids, attention_mask=attention_mask, inputs_dict=inputs_dict
+            )
 
             if model.config.is_encoder_decoder:
                 self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1)
@@ -435,13 +463,14 @@ def test_greedy_generate(self):
     @pytest.mark.generate
     def test_greedy_generate_dict_outputs(self):
         for model_class in self.all_generative_model_classes:
-            config, input_ids, attention_mask = self._get_input_ids_and_config()
+            config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config()
 
             model = model_class(config).to(torch_device).eval()
             output_generate = self._greedy_generate(
                 model=model,
                 input_ids=input_ids,
                 attention_mask=attention_mask,
+                inputs_dict=inputs_dict,
                 output_scores=True,
                 output_logits=True,
                 output_hidden_states=True,
@@ -466,7 +495,7 @@ def test_greedy_generate_dict_outputs(self):
     @pytest.mark.generate
     def test_greedy_generate_dict_outputs_use_cache(self):
         for model_class in self.all_generative_model_classes:
-            config, input_ids, attention_mask = self._get_input_ids_and_config()
+            config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config()
 
             if not hasattr(config, "use_cache"):
                 self.skipTest(reason="This model doesn't support caching")
@@ -479,6 +508,7 @@ def test_greedy_generate_dict_outputs_use_cache(self):
                 model=model,
                 input_ids=input_ids,
                 attention_mask=attention_mask,
+                inputs_dict=inputs_dict,
                 output_scores=True,
                 output_logits=True,
                 output_hidden_states=True,
@@ -497,13 +527,14 @@ def test_greedy_generate_dict_outputs_use_cache(self):
     @pytest.mark.generate
     def test_sample_generate(self):
         for model_class in self.all_generative_model_classes:
-            config, input_ids, attention_mask = self._get_input_ids_and_config()
+            config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config()
 
             model = model_class(config).to(torch_device).eval()
             output_generate = self._sample_generate(
                 model=model,
                 input_ids=input_ids,
                 attention_mask=attention_mask,
+                inputs_dict=inputs_dict,
                 num_return_sequences=1,
             )
 
@@ -515,13 +546,14 @@ def test_sample_generate(self):
     @pytest.mark.generate
     def test_sample_generate_dict_output(self):
         for model_class in self.all_generative_model_classes:
-            config, input_ids, attention_mask = self._get_input_ids_and_config()
+            config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config()
 
             model = model_class(config).to(torch_device).eval()
             output_generate = self._sample_generate(
                 model=model,
                 input_ids=input_ids,
                 attention_mask=attention_mask,
+                inputs_dict=inputs_dict,
                 num_return_sequences=2,
                 output_scores=True,
                 output_logits=True,
@@ -547,7 +579,7 @@ def test_sample_generate_dict_output(self):
     @pytest.mark.generate
     def test_beam_search_generate(self):
         for model_class in self.all_generative_model_classes:
-            config, input_ids, attention_mask = self._get_input_ids_and_config()
+            config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config()
 
             model = model_class(config).to(torch_device).eval()
 
@@ -556,6 +588,7 @@ def test_beam_search_generate(self):
                 model=model,
                 input_ids=input_ids,
                 attention_mask=attention_mask,
+                inputs_dict=inputs_dict,
                 beam_kwargs=beam_kwargs,
             )
 
@@ -567,7 +600,7 @@ def test_beam_search_generate(self):
     @pytest.mark.generate
     def test_beam_search_generate_dict_output(self):
         for model_class in self.all_generative_model_classes:
-            config, input_ids, attention_mask = self._get_input_ids_and_config()
+            config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config()
 
             model = model_class(config).to(torch_device).eval()
             beam_kwargs = self._get_beam_kwargs()
@@ -575,6 +608,7 @@ def test_beam_search_generate_dict_output(self):
                 model=model,
                 input_ids=input_ids,
                 attention_mask=attention_mask,
+                inputs_dict=inputs_dict,
                 beam_kwargs=beam_kwargs,
                 output_scores=True,
                 output_logits=True,
@@ -602,7 +636,7 @@ def test_beam_search_generate_dict_output(self):
     def test_beam_search_generate_dict_outputs_use_cache(self):
         for model_class in self.all_generative_model_classes:
             # enable cache
-            config, input_ids, attention_mask = self._get_input_ids_and_config()
+            config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config()
 
             if not hasattr(config, "use_cache"):
                 self.skipTest(reason="This model doesn't support caching")
@@ -618,6 +652,7 @@ def test_beam_search_generate_dict_outputs_use_cache(self):
                 model=model,
                 input_ids=input_ids,
                 attention_mask=attention_mask,
+                inputs_dict=inputs_dict,
                 beam_kwargs=beam_kwargs,
                 output_scores=True,
                 output_logits=True,
@@ -647,7 +682,7 @@ def test_model_parallel_beam_search(self):
             if model_class._no_split_modules is None:
                 continue
 
-            config, input_ids, attention_mask = self._get_input_ids_and_config()
+            config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config()
 
             model = model_class(config).eval()
             with tempfile.TemporaryDirectory() as tmp_dir:
@@ -659,12 +694,13 @@ def test_model_parallel_beam_search(self):
                     attention_mask=attention_mask,
                     max_new_tokens=self.max_new_tokens,
                     num_beams=2,
+                    **inputs_dict,
                 )
 
     @pytest.mark.generate
     def test_beam_sample_generate(self):
         for model_class in self.all_generative_model_classes:
-            config, input_ids, attention_mask = self._get_input_ids_and_config()
+            config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config()
 
             model = model_class(config).to(torch_device).eval()
             beam_kwargs = self._get_beam_kwargs()
@@ -672,6 +708,7 @@ def test_beam_sample_generate(self):
                 model=model,
                 input_ids=input_ids,
                 attention_mask=attention_mask,
+                inputs_dict=inputs_dict,
                 beam_kwargs=beam_kwargs,
             )
 
@@ -680,28 +717,34 @@ def test_beam_sample_generate(self):
             else:
                 self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + input_ids.shape[-1])
 
-            prepare_inputs_for_generation_args = set(inspect.signature(model.prepare_inputs_for_generation).parameters)
-            # `inputs_embeds` input is well supported when `cache_positions` is used, because it means the modeling
-            # code is up to date with our most recent standards
-            if (
-                "inputs_embeds" in prepare_inputs_for_generation_args
-                and "cache_positions" in prepare_inputs_for_generation_args
-            ):
-                input_embeds = model.get_input_embeddings()(input_ids)
-                beam_kwargs.update({"inputs_embeds": input_embeds})
-                output_generate2 = self._beam_sample_generate(
-                    model=model,
-                    input_ids=None,
-                    attention_mask=attention_mask,
-                    beam_kwargs=beam_kwargs,
+            # for VLMs inputs embeds won't match input ids unless images are encoded and merged with ids properly
+            # no quick fix available, since obtaining image embeddings step is very model-specific
+            if any(name in model.__class__.__name__.lower() for name in ("blip", "llava", "paligemma")):
+                prepare_inputs_for_generation_args = set(
+                    inspect.signature(model.prepare_inputs_for_generation).parameters
                 )
+                # `inputs_embeds` input is well supported when `cache_positions` is used, because it means the modeling
+                # code is up to date with our most recent standards
+                if (
+                    "inputs_embeds" in prepare_inputs_for_generation_args
+                    and "cache_positions" in prepare_inputs_for_generation_args
+                ):
+                    input_embeds = model.get_input_embeddings()(input_ids)
+                    beam_kwargs.update({"inputs_embeds": input_embeds})
+                    output_generate2 = self._beam_sample_generate(
+                        model=model,
+                        input_ids=None,
+                        attention_mask=attention_mask,
+                        inputs_dict={},
+                        beam_kwargs=beam_kwargs,
+                    )
 
-                torch.testing.assert_close(output_generate[:, input_embeds.shape[1] :], output_generate2)
+                    torch.testing.assert_close(output_generate[:, input_embeds.shape[1] :], output_generate2)
 
     @pytest.mark.generate
     def test_beam_sample_generate_dict_output(self):
         for model_class in self.all_generative_model_classes:
-            config, input_ids, attention_mask = self._get_input_ids_and_config()
+            config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config()
 
             model = model_class(config).to(torch_device).eval()
             beam_kwargs = self._get_beam_kwargs()
@@ -710,6 +753,7 @@ def test_beam_sample_generate_dict_output(self):
                 model=model,
                 input_ids=input_ids,
                 attention_mask=attention_mask,
+                inputs_dict=inputs_dict,
                 beam_kwargs=beam_kwargs,
                 output_scores=True,
                 output_logits=True,
@@ -736,7 +780,7 @@ def test_beam_sample_generate_dict_output(self):
 
     @pytest.mark.generate
     def test_generate_without_input_ids(self):
-        config, _, _ = self._get_input_ids_and_config()
+        config, _, _, _ = self._get_input_ids_and_config()
 
         # if no bos token id => cannot generate from None
         if config.bos_token_id is None:
@@ -758,7 +802,7 @@ def test_generate_without_input_ids(self):
     @pytest.mark.generate
     def test_group_beam_search_generate(self):
         for model_class in self.all_generative_model_classes:
-            config, input_ids, attention_mask = self._get_input_ids_and_config()
+            config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config()
 
             model = model_class(config).to(torch_device).eval()
             # check `generate()` and `group_beam_search()` are equal
@@ -767,6 +811,7 @@ def test_group_beam_search_generate(self):
                 model=model,
                 input_ids=input_ids,
                 attention_mask=attention_mask,
+                inputs_dict=inputs_dict,
                 beam_kwargs=beam_kwargs,
             )
             if model.config.is_encoder_decoder:
@@ -781,6 +826,7 @@ def test_group_beam_search_generate(self):
                 model=model,
                 input_ids=input_ids,
                 attention_mask=attention_mask,
+                inputs_dict=inputs_dict,
                 beam_kwargs=beam_kwargs,
             )
             if model.config.is_encoder_decoder:
@@ -791,7 +837,7 @@ def test_group_beam_search_generate(self):
     @pytest.mark.generate
     def test_group_beam_search_generate_dict_output(self):
         for model_class in self.all_generative_model_classes:
-            config, input_ids, attention_mask = self._get_input_ids_and_config()
+            config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config()
 
             model = model_class(config).to(torch_device).eval()
             beam_kwargs = self._get_diverse_beam_kwargs()
@@ -799,6 +845,7 @@ def test_group_beam_search_generate_dict_output(self):
                 model=model,
                 input_ids=input_ids,
                 attention_mask=attention_mask,
+                inputs_dict=inputs_dict,
                 beam_kwargs=beam_kwargs,
                 output_scores=True,
                 output_logits=True,
@@ -827,7 +874,7 @@ def test_group_beam_search_generate_dict_output(self):
     @pytest.mark.generate
     def test_constrained_beam_search_generate(self):
         for model_class in self.all_generative_model_classes:
-            config, input_ids, attention_mask = self._get_input_ids_and_config()
+            config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config()
 
             model = model_class(config).to(torch_device).eval()
 
@@ -845,6 +892,7 @@ def test_constrained_beam_search_generate(self):
                 model=model,
                 input_ids=input_ids,
                 attention_mask=attention_mask,
+                inputs_dict=inputs_dict,
                 constraints=constraints,
                 beam_kwargs=beam_kwargs,
             )
@@ -870,6 +918,7 @@ def test_constrained_beam_search_generate(self):
                 model=model,
                 input_ids=input_ids,
                 attention_mask=attention_mask,
+                inputs_dict=inputs_dict,
                 constraints=constraints,
                 beam_kwargs=beam_kwargs,
             )
@@ -885,7 +934,7 @@ def test_constrained_beam_search_generate(self):
     @pytest.mark.generate
     def test_constrained_beam_search_generate_dict_output(self):
         for model_class in self.all_generative_model_classes:
-            config, input_ids, attention_mask = self._get_input_ids_and_config()
+            config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config()
 
             model = model_class(config).to(torch_device).eval()
 
@@ -902,6 +951,7 @@ def test_constrained_beam_search_generate_dict_output(self):
                 model=model,
                 input_ids=input_ids,
                 attention_mask=attention_mask,
+                inputs_dict=inputs_dict,
                 constraints=constraints,
                 beam_kwargs=beam_kwargs,
                 output_scores=True,
@@ -937,7 +987,7 @@ def test_contrastive_generate(self):
             if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]):
                 self.skipTest(reason="Won't fix: old model with different cache format")
 
-            config, input_ids, attention_mask = self._get_input_ids_and_config()
+            config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config()
 
             # NOTE: contrastive search only works with cache on at the moment.
             if not hasattr(config, "use_cache"):
@@ -947,7 +997,11 @@ def test_contrastive_generate(self):
             # test old generation output for backwards compatibility
             model = model_class(config).to(torch_device).eval()
             output_generate = self._contrastive_generate(
-                model=model, input_ids=input_ids, attention_mask=attention_mask, use_cache=True
+                model=model,
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                inputs_dict=inputs_dict,
+                use_cache=True,
             )
             if model.config.is_encoder_decoder:
                 self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1)
@@ -964,7 +1018,7 @@ def test_contrastive_generate_dict_outputs_use_cache(self):
             if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]):
                 self.skipTest(reason="Won't fix: old model with different cache format")
 
-            config, input_ids, attention_mask = self._get_input_ids_and_config()
+            config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config()
 
             # NOTE: contrastive search only works with cache on at the moment.
             if not hasattr(config, "use_cache"):
@@ -976,6 +1030,7 @@ def test_contrastive_generate_dict_outputs_use_cache(self):
                 model=model,
                 input_ids=input_ids,
                 attention_mask=attention_mask,
+                inputs_dict=inputs_dict,
                 output_scores=True,
                 output_logits=True,
                 output_hidden_states=True,
@@ -1003,7 +1058,7 @@ def test_contrastive_generate_low_memory(self):
             if any(model_name in model_class.__name__.lower() for model_name in ["gptbigcode"]):
                 self.skipTest(reason="TODO: fix me")
 
-            config, input_ids, attention_mask = self._get_input_ids_and_config(batch_size=1)
+            config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config(batch_size=1)
 
             # NOTE: contrastive search only works with cache on at the moment.
             if not hasattr(config, "use_cache"):
@@ -1021,6 +1076,7 @@ def test_contrastive_generate_low_memory(self):
                 low_memory=True,
                 max_new_tokens=self.max_new_tokens,
                 attention_mask=attention_mask,
+                **inputs_dict,
                 use_cache=True,
             )
 
@@ -1031,6 +1087,7 @@ def test_contrastive_generate_low_memory(self):
                 low_memory=False,
                 max_new_tokens=self.max_new_tokens,
                 attention_mask=attention_mask,
+                **inputs_dict,
                 use_cache=True,
             )
             self.assertListEqual(low_output.tolist(), high_output.tolist())
@@ -1055,7 +1112,7 @@ def test_beam_search_low_memory(self):
                 ]
             ):
                 self.skipTest(reason="May fix in the future: need model-specific fixes")
-            config, input_ids, _ = self._get_input_ids_and_config(batch_size=2)
+            config, input_ids, _, _ = self._get_input_ids_and_config(batch_size=2)
             # batch_size=1 is ok, but batch_size>1 will cause non-identical output
 
             config.use_cache = True
@@ -1065,7 +1122,12 @@ def test_beam_search_low_memory(self):
             model = model_class(config).to(torch_device).eval()
 
             low_output = model.generate(
-                input_ids, max_new_tokens=8, num_beams=5, early_stopping=True, low_memory=True, use_cache=True
+                input_ids,
+                max_new_tokens=8,
+                num_beams=5,
+                early_stopping=True,
+                low_memory=True,
+                use_cache=True,
             )
 
             high_output = model.generate(
@@ -1114,7 +1176,7 @@ def test_assisted_decoding_matches_greedy_search(self, assistant_type):
                 self.skipTest(reason="May fix in the future: need model-specific fixes")
 
             # enable cache
-            config, input_ids, attention_mask = self._get_input_ids_and_config(batch_size=1)
+            config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config(batch_size=1)
 
             # NOTE: assisted generation only works with cache on at the moment.
             if not hasattr(config, "use_cache"):
@@ -1140,7 +1202,9 @@ def test_assisted_decoding_matches_greedy_search(self, assistant_type):
                 "return_dict_in_generate": True,
                 "use_cache": True,
             }
-            output_greedy = model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs)
+            output_greedy = model.generate(
+                input_ids, attention_mask=attention_mask, **generation_kwargs, **inputs_dict
+            )
 
             # test with the same assistant model or randomly init one
             # in the first case all candidate tokens are accepted, in the second none is accepted
@@ -1152,7 +1216,9 @@ def test_assisted_decoding_matches_greedy_search(self, assistant_type):
             assistant_model.generation_config.num_assistant_tokens = 2  # see b)
             assistant_model.generation_config.num_assistant_tokens_schedule = "constant"  # see b)
             generation_kwargs.update({"assistant_model": assistant_model})
-            output_assisted = model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs)
+            output_assisted = model.generate(
+                input_ids, attention_mask=attention_mask, **generation_kwargs, **inputs_dict
+            )
 
             # The two outputs must match and their shape must be as expected
 
@@ -1187,7 +1253,7 @@ def test_prompt_lookup_decoding_matches_greedy_search(self):
                 self.skipTest(reason="May fix in the future: need model-specific fixes")
 
             # enable cache
-            config, input_ids, attention_mask = self._get_input_ids_and_config(batch_size=1)
+            config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config(batch_size=1)
 
             # NOTE: assisted generation only works with cache on at the moment.
             if not hasattr(config, "use_cache"):
@@ -1214,10 +1280,14 @@ def test_prompt_lookup_decoding_matches_greedy_search(self):
                 "use_cache": True,
             }
 
-            output_greedy = model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs)
+            output_greedy = model.generate(
+                input_ids, attention_mask=attention_mask, **generation_kwargs, **inputs_dict
+            )
 
             generation_kwargs.update({"prompt_lookup_num_tokens": 2})  # see b)
-            output_prompt_lookup = model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs)
+            output_prompt_lookup = model.generate(
+                input_ids, attention_mask=attention_mask, **generation_kwargs, **inputs_dict
+            )
 
             # The two outputs must match and their shape must be as expected
 
@@ -1239,7 +1309,7 @@ def test_dola_decoding_sample(self):
                 self.skipTest("DoLa is not supported for models that don't return layerwise hidden states")
 
             # enable cache if the model is not openai-gpt, xlnet, cpm, or xlm
-            config, input_ids, attention_mask = self._get_input_ids_and_config()
+            config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config()
 
             # Encoder-decoder models are not supported
             if config.is_encoder_decoder:
@@ -1267,7 +1337,7 @@ def test_dola_decoding_sample(self):
             }
             generation_kwargs.update({"dola_layers": "low"})
             model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {}
-            output_dola = model.generate(input_ids, **model_kwargs, **generation_kwargs)
+            output_dola = model.generate(input_ids, **model_kwargs, **generation_kwargs, **inputs_dict)
             self._check_outputs(output_dola, input_ids, model.config, use_cache=hasattr(config, "use_cache"))
 
     @pytest.mark.generate
@@ -1296,7 +1366,7 @@ def test_assisted_decoding_sample(self):
                 self.skipTest(reason="May fix in the future: need model-specific fixes")
 
             # enable cache
-            config, input_ids, attention_mask = self._get_input_ids_and_config(batch_size=1)
+            config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config(batch_size=1)
 
             # NOTE: assisted generation only works with cache on at the moment.
             if not hasattr(config, "use_cache"):
@@ -1326,9 +1396,11 @@ def test_assisted_decoding_sample(self):
                 "return_dict_in_generate": True,
                 "use_cache": True,
             }
-            output_assisted = model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs)
+            output_assisted = model.generate(
+                input_ids, attention_mask=attention_mask, **generation_kwargs, **inputs_dict
+            )
 
-            self._check_outputs(output_assisted, input_ids, model.config, use_cache=True)
+            self._check_outputs(output_assisted, input_ids, config, use_cache=True)
 
     @pytest.mark.generate
     def test_prompt_lookup_decoding_stops_at_eos(self):
@@ -1364,7 +1436,7 @@ def test_generate_with_head_masking(self):
         """Test designed for encoder-decoder models to ensure the attention head masking is used."""
         attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
         for model_class in self.all_generative_model_classes:
-            config, input_ids, attention_mask = self._get_input_ids_and_config()
+            config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config()
             # We want to test only encoder-decoder models
             if not config.is_encoder_decoder:
                 continue
@@ -1394,6 +1466,7 @@ def test_generate_with_head_masking(self):
                     return_dict_in_generate=True,
                     remove_invalid_values=True,
                     **{name: mask},
+                    **inputs_dict,
                 )
                 # We check the state of decoder_attentions and cross_attentions just from the last step
                 attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1]
@@ -1416,7 +1489,7 @@ def test_left_padding_compatibility(self):
         # - The model must be a decoder-only architecture (encoder-based architectures use right-padding)
         decoder_only_classes = []
         for model_class in self.all_generative_model_classes:
-            config, _, _ = self._get_input_ids_and_config()
+            config, _, _, _ = self._get_input_ids_and_config()
             if config.is_encoder_decoder:
                 continue
             else:
@@ -1449,7 +1522,7 @@ def _prepare_model_kwargs(input_ids, attention_mask, signature):
             return model_kwargs
 
         for model_class in decoder_only_classes:
-            config, input_ids, attention_mask = self._get_input_ids_and_config()
+            config, input_ids, attention_mask, _ = self._get_input_ids_and_config()
             model = model_class(config).to(torch_device).eval()
             signature = inspect.signature(model.forward).parameters.keys()
 
@@ -1462,7 +1535,9 @@ def _prepare_model_kwargs(input_ids, attention_mask, signature):
 
             # With left-padding (length 32)
             # can hardcode pad_token to be 0 as we'll do attn masking anyway
-            pad_token_id = config.pad_token_id if getattr(config, "pad_token_id") is not None else 0
+            pad_token_id = (
+                config.get_text_config().pad_token_id if config.get_text_config().pad_token_id is not None else 0
+            )
             pad_size = (input_ids.shape[0], 32)
             padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * pad_token_id
             padded_input_ids = torch.cat((padding, input_ids), dim=1)
@@ -1550,7 +1625,7 @@ def test_generate_from_inputs_embeds_decoder_only(self):
         # When supported, tests that the decoder model can generate from `inputs_embeds` instead of `input_ids`
         # if fails, you should probably update the `prepare_inputs_for_generation` function
         for model_class in self.all_generative_model_classes:
-            config, input_ids, _ = self._get_input_ids_and_config()
+            config, input_ids, _, _ = self._get_input_ids_and_config()
 
             # Ignore:
             # a) eos (to always output 20 tokens) and pad (so we don't try to infer the attn mask from the input_ids,
@@ -1572,25 +1647,23 @@ def test_generate_from_inputs_embeds_decoder_only(self):
                 continue
 
             # Traditional way of generating text
-            outputs_from_ids = model.generate(input_ids)
-            self.assertEqual(outputs_from_ids.shape, (2, 20))
+            outputs_from_ids = model.generate(input_ids, max_new_tokens=5)
+            self.assertEqual(outputs_from_ids.shape, (input_ids.shape[0], input_ids.shape[1] + 5))
 
             # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output)
             inputs_embeds = model.get_input_embeddings()(input_ids)
-            outputs_from_embeds = model.generate(input_ids, inputs_embeds=inputs_embeds)
+            outputs_from_embeds = model.generate(input_ids, inputs_embeds=inputs_embeds, max_new_tokens=5)
             self.assertListEqual(outputs_from_ids.tolist(), outputs_from_embeds.tolist())
 
             # But if we pass different inputs_embeds, we should get different outputs
             torch.manual_seed(0)
             random_embeds = torch.rand_like(inputs_embeds)
-            outputs_from_rand_embeds = model.generate(input_ids, inputs_embeds=random_embeds)
+            outputs_from_rand_embeds = model.generate(input_ids, inputs_embeds=random_embeds, max_new_tokens=5)
             with self.assertRaises(AssertionError):
                 self.assertListEqual(outputs_from_rand_embeds.tolist(), outputs_from_embeds.tolist())
 
             # input_ids is not a required input -- if we don't pass it, the newly generated tokens will be the same
-            outputs_from_embeds_wo_ids = model.generate(
-                inputs_embeds=inputs_embeds, max_new_tokens=20 - inputs_embeds.shape[1]
-            )
+            outputs_from_embeds_wo_ids = model.generate(inputs_embeds=inputs_embeds, max_new_tokens=5)
             self.assertListEqual(
                 outputs_from_embeds[:, inputs_embeds.shape[1] :].tolist(),
                 outputs_from_embeds_wo_ids.tolist(),
@@ -1607,7 +1680,7 @@ def test_generate_from_inputs_embeds_with_static_cache(self):
             if not model_class._supports_static_cache:
                 self.skipTest(reason="This model does not support the static cache format")
 
-            config, input_ids, attention_mask = self._get_input_ids_and_config()
+            config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config()
             if config.is_encoder_decoder:
                 self.skipTest(reason="This model is encoder-decoder and has Encoder-Decoder Cache")
 
@@ -1621,27 +1694,30 @@ def test_generate_from_inputs_embeds_with_static_cache(self):
             max_cache_len = 30
 
             # here we force to not stop at eos and go until max-length
-            model.generation_config.eos_token_id = model.config.eos_token_id = -1
+            model.generation_config.eos_token_id = model.config.get_text_config().eos_token_id = -1
             generation_kwargs = {
                 "max_length": max_cache_len,
                 "cache_implementation": "static",
                 "return_dict_in_generate": True,  # Required to return `past_key_values`
             }
 
+            text_config = model.config.get_text_config()
             head_dim = (
-                model.config.head_dim
-                if hasattr(model.config, "head_dim")
-                else model.config.hidden_size // model.config.num_attention_heads
+                text_config.head_dim
+                if hasattr(text_config, "head_dim")
+                else text_config.hidden_size // text_config.num_attention_heads
             )
             num_key_value_heads = (
-                model.config.num_attention_heads
-                if getattr(config, "num_key_value_heads", None) is None
-                else model.config.num_key_value_heads
+                text_config.num_attention_heads
+                if getattr(text_config, "num_key_value_heads", None) is None
+                else text_config.num_key_value_heads
             )
-            num_hidden_layers = config.num_hidden_layers
+            num_hidden_layers = text_config.num_hidden_layers
 
             inputs_embeds = model.get_input_embeddings()(input_ids)
-            outputs = model.generate(inputs_embeds=inputs_embeds, attention_mask=attention_mask, **generation_kwargs)
+            outputs = model.generate(
+                inputs_embeds=inputs_embeds, attention_mask=attention_mask, **generation_kwargs, **inputs_dict
+            )
 
             # we should get `max_length` in shape, not `max_length - embeds_length`
             cache_shape = (batch_size, num_key_value_heads, max_cache_len, head_dim)
@@ -1742,7 +1818,7 @@ def test_new_cache_format(self, num_beams, do_sample):
             if not model_class._supports_cache_class:
                 self.skipTest(reason="This model does not support the new cache format")
 
-            config, input_ids, attention_mask = self._get_input_ids_and_config()
+            config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config()
 
             model = model_class(config).to(torch_device).eval()
             generation_kwargs = {
@@ -1757,7 +1833,9 @@ def test_new_cache_format(self, num_beams, do_sample):
             # Sets seed before calling `generate` for the case with do_sample=True
             seed = torch.randint(0, 1000000, (1,)).item()
             set_seed(seed)
-            legacy_results = model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs)
+            legacy_results = model.generate(
+                input_ids, attention_mask=attention_mask, **generation_kwargs, **inputs_dict
+            )
             set_seed(seed)
             if config.is_encoder_decoder:
                 cache_cls = EncoderDecoderCache
@@ -1766,7 +1844,11 @@ def test_new_cache_format(self, num_beams, do_sample):
                 cache_cls = DynamicCache
                 past_key_values = cache_cls()
             new_results = model.generate(
-                input_ids, attention_mask=attention_mask, past_key_values=past_key_values, **generation_kwargs
+                input_ids,
+                attention_mask=attention_mask,
+                past_key_values=past_key_values,
+                **generation_kwargs,
+                **inputs_dict,
             )
 
             # The two sets of generated sequences must match, despite the cache format between forward passes being
@@ -1810,7 +1892,7 @@ def test_generate_with_static_cache(self):
             if not model_class._supports_static_cache:
                 self.skipTest(reason="This model does not support the static cache format")
 
-            config, input_ids, attention_mask = self._get_input_ids_and_config()
+            config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config()
             if config.is_encoder_decoder:
                 self.skipTest(reason="This model is encoder-decoder and has Encoder-Decoder Cache")
 
@@ -1838,7 +1920,7 @@ def test_generate_with_static_cache(self):
                 else config.num_key_value_heads
             )
             num_hidden_layers = config.num_hidden_layers
-            results = model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs)
+            results = model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs, **inputs_dict)
 
             cache_shape = (batch_size, num_key_value_heads, max_cache_len, head_dim)
             self.assertTrue(isinstance(results.past_key_values, StaticCache))
@@ -1852,7 +1934,7 @@ def test_generate_with_quant_cache(self):
             if not model_class._supports_quantized_cache:
                 self.skipTest(reason="This model does not support the quantized cache format")
 
-            config, input_ids, attention_mask = self._get_input_ids_and_config()
+            config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config()
             config.is_decoder = True
 
             model = model_class(config).to(torch_device).eval()
@@ -1865,7 +1947,7 @@ def test_generate_with_quant_cache(self):
                 "use_cache": True,
             }
 
-            results = model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs)
+            results = model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs, **inputs_dict)
             self.assertTrue(isinstance(results.past_key_values, QuantoQuantizedCache))
 
             # passing past key values of different type should raise Error
@@ -1931,7 +2013,7 @@ def test_generate_methods_with_num_logits_to_keep(self):
             if "num_logits_to_keep" not in set(inspect.signature(model_class.forward).parameters.keys()):
                 self.skipTest(reason="This model does not support `num_logits_to_keep` argument.")
 
-            config, input_ids, attention_mask = self._get_input_ids_and_config()
+            config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config()
             config.use_cache = True
             config.is_decoder = True
 
@@ -1946,10 +2028,12 @@ def test_generate_methods_with_num_logits_to_keep(self):
 
             # Setting num_logits_to_keep at 0 keeps all logits (old behavior)
             with_all_logits = model.generate(
-                input_ids, attention_mask=attention_mask, **generation_kwargs, num_logits_to_keep=0
+                input_ids, attention_mask=attention_mask, **generation_kwargs, **inputs_dict, num_logits_to_keep=0
             )
             # By default, num_logits_to_keep is automatically set to 1 if not provided (new behavior)
-            without_all_logits = model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs)
+            without_all_logits = model.generate(
+                input_ids, attention_mask=attention_mask, **inputs_dict, **generation_kwargs
+            )
             self.assertEqual(with_all_logits.tolist(), without_all_logits.tolist())
 
     def test_assisted_decoding_with_num_logits_to_keep(self):
@@ -1959,7 +2043,7 @@ def test_assisted_decoding_with_num_logits_to_keep(self):
             if model_class._is_stateful:
                 self.skipTest(reason="Stateful models don't support assisted generation")
 
-            config, input_ids, attention_mask = self._get_input_ids_and_config(batch_size=1)
+            config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config(batch_size=1)
             config.use_cache = True
             config.is_decoder = True
 
@@ -1976,10 +2060,12 @@ def test_assisted_decoding_with_num_logits_to_keep(self):
 
             # Setting num_logits_to_keep at 0 keeps all logits (old behavior)
             with_all_logits = model.generate(
-                input_ids, attention_mask=attention_mask, **generation_kwargs, num_logits_to_keep=0
+                input_ids, attention_mask=attention_mask, **generation_kwargs, **inputs_dict, num_logits_to_keep=0
             )
             # By default, num_logits_to_keep is automatically set to 1 if not provided (new behavior)
-            without_all_logits = model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs)
+            without_all_logits = model.generate(
+                input_ids, attention_mask=attention_mask, **inputs_dict, **generation_kwargs
+            )
             self.assertEqual(with_all_logits.tolist(), without_all_logits.tolist())
 
     def _check_outputs(self, output, input_ids, config, use_cache=False, num_return_sequences=1):
diff --git a/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py b/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py
index da909a7c4eb0..0f28fc2d67b5 100644
--- a/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py
+++ b/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py
@@ -289,7 +289,10 @@ def _get_input_ids_and_config(self, batch_size=2):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         config.attention_type = "original_full"
 
-        input_ids = inputs_dict[self.input_name]
+        input_ids = inputs_dict.pop(self.input_name)
+        _ = inputs_dict.pop("attention_mask", None)
+        _ = inputs_dict.pop("decoder_input_ids", None)
+        _ = inputs_dict.pop("decoder_attention_mask", None)
         attention_mask = torch.ones_like(input_ids, dtype=torch.long)
 
         # cut to half length & take max batch_size 3
@@ -300,7 +303,7 @@ def _get_input_ids_and_config(self, batch_size=2):
         if config.eos_token_id is not None and config.pad_token_id is None:
             # hack to allow generate for models such as GPT2 as is done in `generate()`
             config.pad_token_id = config.eos_token_id
-        return config, input_ids, attention_mask
+        return config, input_ids, attention_mask, inputs_dict
 
     def setUp(self):
         self.model_tester = BigBirdPegasusModelTester(self)
diff --git a/tests/models/bloom/test_modeling_bloom.py b/tests/models/bloom/test_modeling_bloom.py
index f17ee1170ab2..b20012c2a197 100644
--- a/tests/models/bloom/test_modeling_bloom.py
+++ b/tests/models/bloom/test_modeling_bloom.py
@@ -389,10 +389,6 @@ def test_bloom_weight_initialization(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_bloom_weight_initialization(*config_and_inputs)
 
-    @unittest.skip(reason="Bloom has a non-standard KV cache format.")
-    def test_past_key_values_format(self):
-        pass
-
     @slow
     def test_model_from_pretrained(self):
         model_name = "bigscience/bigscience-small-testing"
diff --git a/tests/models/git/test_modeling_git.py b/tests/models/git/test_modeling_git.py
index b0e2e892ec13..02a3c033c452 100644
--- a/tests/models/git/test_modeling_git.py
+++ b/tests/models/git/test_modeling_git.py
@@ -450,6 +450,53 @@ def test_model_various_embeddings(self):
             config_and_inputs[0].position_embedding_type = type
             self.model_tester.create_and_check_model(*config_and_inputs)
 
+    def _check_attentions_for_generate(
+        self, batch_size, attentions, min_length, max_length, config, use_cache=False, num_beam_groups=1
+    ):
+        # GIT attention shape depends on image inputs, overwrite
+        self.assertIsInstance(attentions, tuple)
+        self.assertListEqual(
+            [isinstance(iter_attentions, tuple) for iter_attentions in attentions], [True] * len(attentions)
+        )
+        self.assertEqual(len(attentions), (max_length - min_length) * num_beam_groups)
+        image_length = int((config.vision_config.image_size / config.vision_config.patch_size) ** 2 + 1)
+
+        for idx, iter_attentions in enumerate(attentions):
+            tgt_len = min_length + idx + image_length if not use_cache else 1
+            src_len = min_length + idx + image_length
+
+            expected_shape = (
+                batch_size * num_beam_groups,
+                config.num_attention_heads,
+                tgt_len,
+                src_len,
+            )
+            # check attn size
+            self.assertListEqual(
+                [layer_attention.shape for layer_attention in iter_attentions], [expected_shape] * len(iter_attentions)
+            )
+
+    def _check_hidden_states_for_generate(
+        self, batch_size, hidden_states, min_length, max_length, config, use_cache=False, num_beam_groups=1
+    ):
+        # GIT attention shape depends on image inputs, overwrite
+        self.assertIsInstance(hidden_states, tuple)
+        self.assertListEqual(
+            [isinstance(iter_hidden_states, tuple) for iter_hidden_states in hidden_states],
+            [True] * len(hidden_states),
+        )
+        self.assertEqual(len(hidden_states), (max_length - min_length) * num_beam_groups)
+        image_length = int((config.vision_config.image_size / config.vision_config.patch_size) ** 2 + 1)
+
+        for idx, iter_hidden_states in enumerate(hidden_states):
+            seq_len = min_length + idx + image_length if not use_cache else 1
+            expected_shape = (batch_size * num_beam_groups, seq_len, config.hidden_size)
+            # check hidden size
+            self.assertListEqual(
+                [layer_hidden_states.shape for layer_hidden_states in iter_hidden_states],
+                [expected_shape] * len(iter_hidden_states),
+            )
+
     @slow
     def test_model_from_pretrained(self):
         model_name = "microsoft/git-base"
@@ -468,10 +515,18 @@ def test_contrastive_generate(self):
     def test_contrastive_generate_dict_outputs_use_cache(self):
         pass
 
+    @unittest.skip(reason="GIT has pixel values as additional input")
+    def test_contrastive_generate_low_memory(self):
+        pass
+
     @unittest.skip(reason="GIT has pixel values as additional input")
     def test_greedy_generate_dict_outputs_use_cache(self):
         pass
 
+    @unittest.skip(reason="GIT has pixel values as additional input")
+    def test_dola_decoding_sample(self):
+        pass
+
 
 @require_torch
 @require_vision
diff --git a/tests/models/led/test_modeling_led.py b/tests/models/led/test_modeling_led.py
index 2247a64374dd..a4d81ab2e1c6 100644
--- a/tests/models/led/test_modeling_led.py
+++ b/tests/models/led/test_modeling_led.py
@@ -338,6 +338,14 @@ def test_global_attention(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
         self.model_tester.check_global_attention(*config_and_inputs)
 
+    def _get_input_ids_and_config(self, batch_size=2):
+        config, input_ids, attention_mask, inputs_dict = GenerationTesterMixin._get_input_ids_and_config(
+            self, batch_size=batch_size
+        )
+        # LED computes attention scores based on mask indices if `is_global`
+        inputs_dict.pop("global_attention_mask")
+        return config, input_ids, attention_mask, inputs_dict
+
     # LEDForSequenceClassification does not support inputs_embeds
     def test_inputs_embeds(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py
index 305fc9e9a84c..e183c38a59f7 100644
--- a/tests/models/llava/test_modeling_llava.py
+++ b/tests/models/llava/test_modeling_llava.py
@@ -36,6 +36,7 @@
     torch_device,
 )
 
+from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
 
@@ -80,7 +81,7 @@ def __init__(
             "initializer_range": 0.02,
             "num_labels": 3,
             "num_choices": 4,
-            "pad_token_id": 0,
+            "pad_token_id": 1,
         },
         is_training=True,
         vision_config={
@@ -106,7 +107,7 @@ def __init__(
         self.vision_feature_layer = vision_feature_layer
         self.text_config = text_config
         self.vision_config = vision_config
-        self.seq_length = seq_length
+        self.pad_token_id = text_config["pad_token_id"]
 
         self.num_hidden_layers = text_config["num_hidden_layers"]
         self.vocab_size = text_config["vocab_size"]
@@ -118,6 +119,8 @@ def __init__(
         self.num_channels = 3
         self.image_size = 336
         self.encoder_seq_length = 231
+        self.num_image_tokens = 224
+        self.seq_length = seq_length + self.num_image_tokens
 
     def get_config(self):
         return LlavaConfig(
@@ -128,6 +131,7 @@ def get_config(self):
             projector_hidden_act=self.projector_hidden_act,
             vision_feature_select_strategy=self.vision_feature_select_strategy,
             vision_feature_layer=self.vision_feature_layer,
+            image_seq_length=self.num_image_tokens,
         )
 
     def prepare_config_and_inputs(self):
@@ -148,8 +152,8 @@ def prepare_config_and_inputs_for_common(self):
         config, pixel_values = config_and_inputs
         input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1
         attention_mask = input_ids.ne(1).to(torch_device)
-        # we are giving 3 images let's make sure we pass in 3 image tokens
-        input_ids[:, 1] = config.image_token_index
+        input_ids[input_ids == config.image_token_index] = self.pad_token_id
+        input_ids[:, : self.num_image_tokens] = config.image_token_index
         inputs_dict = {
             "pixel_values": pixel_values,
             "input_ids": input_ids,
@@ -172,12 +176,13 @@ def create_and_check_llava_model_fp16_forward(self, config, input_ids, pixel_val
 
 
 @require_torch
-class LlavaForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestCase):
+class LlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
     """
     Model tester for `LlavaForConditionalGeneration`.
     """
 
     all_model_classes = (LlavaForConditionalGeneration,) if is_torch_available() else ()
+    all_generative_model_classes = (LlavaForConditionalGeneration,) if is_torch_available() else ()
     pipeline_model_mapping = {"image-to-text": LlavaForConditionalGeneration} if is_torch_available() else {}
     test_pruning = False
     test_head_masking = False
diff --git a/tests/models/llava_next/test_modeling_llava_next.py b/tests/models/llava_next/test_modeling_llava_next.py
index bd0b5a190646..772f19e13a4b 100644
--- a/tests/models/llava_next/test_modeling_llava_next.py
+++ b/tests/models/llava_next/test_modeling_llava_next.py
@@ -86,12 +86,12 @@ def __init__(
             "initializer_range": 0.02,
             "num_labels": 3,
             "num_choices": 4,
-            "pad_token_id": 0,
+            "pad_token_id": 1,
         },
         is_training=True,
         vision_config={
             "image_size": 16,
-            "patch_size": 2,
+            "patch_size": 4,
             "num_channels": 3,
             "is_training": True,
             "hidden_size": 32,
@@ -112,7 +112,7 @@ def __init__(
         self.vision_feature_layer = vision_feature_layer
         self.text_config = text_config
         self.vision_config = vision_config
-        self.seq_length = seq_length
+        self.pad_token_id = text_config["pad_token_id"]
 
         self.num_hidden_layers = text_config["num_hidden_layers"]
         self.vocab_size = text_config["vocab_size"]
@@ -123,8 +123,10 @@ def __init__(
         self.batch_size = 3
         self.num_channels = 3
         self.image_size = 30
-        self.encoder_seq_length = 342
+        self.encoder_seq_length = 95
         self.image_grid_pinpoints = [[32, 32]]
+        self.num_image_tokens = 88
+        self.seq_length = seq_length + self.num_image_tokens
 
     def get_config(self):
         return LlavaNextConfig(
@@ -136,6 +138,7 @@ def get_config(self):
             vision_feature_select_strategy=self.vision_feature_select_strategy,
             vision_feature_layer=self.vision_feature_layer,
             image_grid_pinpoints=self.image_grid_pinpoints,
+            image_seq_length=self.num_image_tokens,
         )
 
     def prepare_config_and_inputs(self):
@@ -157,11 +160,10 @@ def prepare_config_and_inputs_for_common(self):
         config, pixel_values = config_and_inputs
         input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2
         attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device)
-        # we are giving 3 images let's make sure we pass in 3 image tokens
-        input_ids[:, 1] = config.image_token_index
-        labels = torch.zeros((self.batch_size, self.seq_length), dtype=torch.long, device=torch_device)
-        # maskout where the image token is
-        labels[:, 1] == self.ignore_index
+        input_ids[input_ids == config.image_token_index] = self.pad_token_id
+
+        input_ids[:, : self.num_image_tokens] = config.image_token_index
+
         inputs_dict = {
             "pixel_values": pixel_values,
             "image_sizes": torch.tensor(
@@ -169,7 +171,6 @@ def prepare_config_and_inputs_for_common(self):
             ),
             "input_ids": input_ids,
             "attention_mask": attention_mask,
-            "labels": labels,
         }
         return config, inputs_dict
 
@@ -214,6 +215,7 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
     """
 
     all_model_classes = (LlavaNextForConditionalGeneration,) if is_torch_available() else ()
+    all_generative_model_classes = (LlavaNextForConditionalGeneration,) if is_torch_available() else ()
     test_pruning = False
     test_head_masking = False
 
diff --git a/tests/models/llava_next_video/test_modeling_llava_next_video.py b/tests/models/llava_next_video/test_modeling_llava_next_video.py
index 35df4085df05..30eaa7fb050c 100644
--- a/tests/models/llava_next_video/test_modeling_llava_next_video.py
+++ b/tests/models/llava_next_video/test_modeling_llava_next_video.py
@@ -87,12 +87,12 @@ def __init__(
             "initializer_range": 0.02,
             "num_labels": 3,
             "num_choices": 4,
-            "pad_token_id": 0,
+            "pad_token_id": 2,
         },
         is_training=True,
         vision_config={
             "image_size": 16,
-            "patch_size": 2,
+            "patch_size": 4,
             "num_channels": 3,
             "is_training": True,
             "hidden_size": 32,
@@ -114,7 +114,7 @@ def __init__(
         self.vision_feature_layer = vision_feature_layer
         self.text_config = text_config
         self.vision_config = vision_config
-        self.seq_length = seq_length
+        self.pad_token_id = text_config["pad_token_id"]
 
         self.num_hidden_layers = text_config["num_hidden_layers"]
         self.vocab_size = text_config["vocab_size"]
@@ -125,8 +125,11 @@ def __init__(
         self.batch_size = 3
         self.num_channels = 3
         self.image_size = 30
-        self.encoder_seq_length = 469
+        self.encoder_seq_length = 127
         self.image_grid_pinpoints = [[32, 32]]
+        self.num_image_tokens = 88
+        self.num_video_tokens = 32
+        self.seq_length = seq_length + self.num_image_tokens + self.num_video_tokens
 
     def get_config(self):
         return LlavaNextVideoConfig(
@@ -139,6 +142,8 @@ def get_config(self):
             vision_feature_select_strategy=self.vision_feature_select_strategy,
             vision_feature_layer=self.vision_feature_layer,
             image_grid_pinpoints=self.image_grid_pinpoints,
+            video_seq_length=self.num_video_tokens,
+            image_seq_length=self.num_image_tokens,
         )
 
     def prepare_config_and_inputs(self):
@@ -168,13 +173,12 @@ def prepare_config_and_inputs_for_common(self):
         config, pixel_values, pixel_values_videos = self.prepare_config_and_inputs()
         input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2
         attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device)
-        # we are giving 3 images and videos let's make sure we pass in 3 special tokens
-        input_ids[:, 1] = config.image_token_index
-        input_ids[:, 2] = config.video_token_index
-        labels = torch.zeros((self.batch_size, self.seq_length), dtype=torch.long, device=torch_device)
-        # maskout where the image/video token is
-        labels[:, 1] == self.ignore_index
-        labels[:, 2] == self.ignore_index
+
+        input_ids[input_ids == config.image_token_index] = self.pad_token_id
+        input_ids[input_ids == config.video_token_index] = self.pad_token_id
+        input_ids[:, : self.num_image_tokens] = config.image_token_index
+        input_ids[:, self.num_image_tokens : self.num_video_tokens + self.num_image_tokens] = config.video_token_index
+
         inputs_dict = {
             "pixel_values": pixel_values,
             "pixel_values_videos": pixel_values_videos,
@@ -183,7 +187,6 @@ def prepare_config_and_inputs_for_common(self):
             ),
             "input_ids": input_ids,
             "attention_mask": attention_mask,
-            "labels": labels,
         }
         return config, inputs_dict
 
@@ -230,6 +233,7 @@ class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, Generati
     """
 
     all_model_classes = (LlavaNextVideoForConditionalGeneration,) if is_torch_available() else ()
+    all_generative_model_classes = (LlavaNextVideoForConditionalGeneration,) if is_torch_available() else ()
     test_pruning = False
     test_head_masking = False
 
diff --git a/tests/models/llava_onevision/test_modeling_llava_onevision.py b/tests/models/llava_onevision/test_modeling_llava_onevision.py
index 2300e7b29c48..0e9c88cb3463 100644
--- a/tests/models/llava_onevision/test_modeling_llava_onevision.py
+++ b/tests/models/llava_onevision/test_modeling_llava_onevision.py
@@ -60,7 +60,7 @@ def __init__(
         self,
         parent,
         ignore_index=-100,
-        image_token_index=0,
+        image_token_index=1,
         projector_hidden_act="gelu",
         seq_length=7,
         vision_feature_select_strategy="full",
@@ -92,7 +92,7 @@ def __init__(
         is_training=True,
         vision_config={
             "image_size": 16,
-            "patch_size": 2,
+            "patch_size": 8,
             "num_channels": 3,
             "is_training": True,
             "hidden_size": 32,
@@ -113,7 +113,9 @@ def __init__(
         self.vision_feature_layer = vision_feature_layer
         self.text_config = text_config
         self.vision_config = vision_config
-        self.seq_length = seq_length
+        self.pad_token_id = text_config["pad_token_id"]
+        self.num_image_tokens = 10
+        self.seq_length = seq_length + self.num_image_tokens
 
         self.num_hidden_layers = text_config["num_hidden_layers"]
         self.vocab_size = text_config["vocab_size"]
@@ -124,8 +126,7 @@ def __init__(
         self.batch_size = 3
         self.num_channels = 3
         self.image_size = 30
-        self.encoder_seq_length = 7
-        self.image_grid_pinpoints = [[32, 32]]
+        self.image_grid_pinpoints = [[16, 16]]
 
     def get_config(self):
         return LlavaOnevisionConfig(
@@ -143,7 +144,7 @@ def prepare_config_and_inputs(self):
         pixel_values = floats_tensor(
             [
                 self.batch_size,
-                9,
+                3,
                 self.vision_config["num_channels"],
                 self.vision_config["image_size"],
                 self.vision_config["image_size"],
@@ -158,16 +159,16 @@ def prepare_config_and_inputs_for_common(self):
         config, pixel_values = config_and_inputs
         input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2
         attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device)
-        # we are giving 3 images let's make sure we pass in 3 image tokens
-        input_ids[:, 1] = config.image_token_index
+
+        input_ids[input_ids == config.image_token_index] = self.pad_token_id
+        input_ids[:, : self.num_image_tokens] = config.image_token_index
+
         labels = torch.zeros((self.batch_size, self.seq_length), dtype=torch.long, device=torch_device)
-        # maskout where the image token is
-        labels[:, 1] == self.ignore_index
+        labels[:, : self.num_image_tokens] == self.ignore_index
+
         inputs_dict = {
             "pixel_values": pixel_values,
-            "image_sizes": torch.tensor(
-                [[self.vision_config["image_size"], self.vision_config["image_size"]]] * self.batch_size
-            ),
+            "image_sizes": torch.tensor([[45, 45]] * self.batch_size),
             "input_ids": input_ids,
             "attention_mask": attention_mask,
             "labels": labels,
diff --git a/tests/models/musicgen/test_modeling_musicgen.py b/tests/models/musicgen/test_modeling_musicgen.py
index 870a4c92767b..e143b8ac3c86 100644
--- a/tests/models/musicgen/test_modeling_musicgen.py
+++ b/tests/models/musicgen/test_modeling_musicgen.py
@@ -286,12 +286,19 @@ def _get_input_ids_and_config(self, batch_size=2):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         input_ids = inputs_dict["input_ids"]
 
+        _ = inputs_dict.pop("attention_mask", None)
+        inputs_dict = {
+            k: v[:batch_size, ...]
+            for k, v in inputs_dict.items()
+            if "head_mask" not in k and isinstance(v, torch.Tensor)
+        }
+
         # take max batch_size
         sequence_length = input_ids.shape[-1]
         input_ids = input_ids[: batch_size * config.num_codebooks, :]
 
         attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long)
-        return config, input_ids, attention_mask
+        return config, input_ids, attention_mask, inputs_dict
 
     def _get_logits_processor_kwargs(self, do_sample=False):
         logits_processor_kwargs = {}
@@ -299,7 +306,7 @@ def _get_logits_processor_kwargs(self, do_sample=False):
 
     def test_greedy_generate_stereo_outputs(self):
         for model_class in self.greedy_sample_model_classes:
-            config, input_ids, attention_mask = self._get_input_ids_and_config()
+            config, input_ids, attention_mask, inputs_dict = self._get_input_ids_and_config()
             config.audio_channels = 2
             model = model_class(config).to(torch_device).eval()
             output_generate = self._greedy_generate(
@@ -310,6 +317,7 @@ def test_greedy_generate_stereo_outputs(self):
                 output_hidden_states=True,
                 output_attentions=True,
                 return_dict_in_generate=True,
+                inputs_dict={},
             )
 
             self.assertIsInstance(output_generate, GenerateDecoderOnlyOutput)
diff --git a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py
index 9b34f4dde659..28c2bf2f168b 100644
--- a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py
+++ b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py
@@ -289,12 +289,19 @@ def _get_input_ids_and_config(self, batch_size=2):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         input_ids = inputs_dict["input_ids"]
 
+        _ = inputs_dict.pop("attention_mask", None)
+        inputs_dict = {
+            k: v[:batch_size, ...]
+            for k, v in inputs_dict.items()
+            if "head_mask" not in k and isinstance(v, torch.Tensor)
+        }
+
         # take max batch_size
         sequence_length = input_ids.shape[-1]
         input_ids = input_ids[: batch_size * config.num_codebooks, :]
 
         attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long)
-        return config, input_ids, attention_mask
+        return config, input_ids, attention_mask, inputs_dict
 
     def _get_logits_processor_kwargs(self, do_sample=False):
         logits_processor_kwargs = {}
@@ -302,7 +309,7 @@ def _get_logits_processor_kwargs(self, do_sample=False):
 
     def test_greedy_generate_stereo_outputs(self):
         for model_class in self.greedy_sample_model_classes:
-            config, input_ids, attention_mask = self._get_input_ids_and_config()
+            config, input_ids, attention_mask, _ = self._get_input_ids_and_config()
             config.audio_channels = 2
             model = model_class(config).to(torch_device).eval()
             output_generate = self._greedy_generate(
@@ -313,6 +320,7 @@ def test_greedy_generate_stereo_outputs(self):
                 output_hidden_states=True,
                 output_attentions=True,
                 return_dict_in_generate=True,
+                inputs_dict={},
             )
 
             self.assertIsInstance(output_generate, GenerateDecoderOnlyOutput)
diff --git a/tests/models/paligemma/test_modeling_paligemma.py b/tests/models/paligemma/test_modeling_paligemma.py
index 411193db1ce3..d592205443e1 100644
--- a/tests/models/paligemma/test_modeling_paligemma.py
+++ b/tests/models/paligemma/test_modeling_paligemma.py
@@ -35,6 +35,7 @@
     torch_device,
 )
 
+from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
 
@@ -82,7 +83,7 @@ def __init__(
             "initializer_range": 0.02,
             "num_labels": 3,
             "num_choices": 4,
-            "pad_token_id": 0,
+            "pad_token_id": 1,
         },
         is_training=True,
         vision_config={
@@ -115,6 +116,7 @@ def __init__(
         self.vision_config = vision_config
         self.seq_length = seq_length
         self.projection_dim = projection_dim
+        self.pad_token_id = text_config["pad_token_id"]
 
         self.num_hidden_layers = text_config["num_hidden_layers"]
         self.vocab_size = text_config["vocab_size"]
@@ -160,7 +162,7 @@ def prepare_config_and_inputs_for_common(self):
         attention_mask = input_ids.ne(1).to(torch_device)
         # set the 16 first tokens to be image, and ensure that no other tokens are image tokens
         # do not change this unless you modified image size or patch size
-        input_ids = torch.where(input_ids == config.image_token_index, 2, input_ids)
+        input_ids[input_ids == config.image_token_index] = self.pad_token_id
         input_ids[:, :16] = config.image_token_index
         inputs_dict = {
             "pixel_values": pixel_values,
@@ -173,12 +175,13 @@ def prepare_config_and_inputs_for_common(self):
 
 
 @require_torch
-class PaliGemmaForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestCase):
+class PaliGemmaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
     """
     Model tester for `PaliGemmaForConditionalGeneration`.
     """
 
     all_model_classes = (PaliGemmaForConditionalGeneration,) if is_torch_available() else ()
+    all_generative_model_classes = (PaliGemmaForConditionalGeneration,) if is_torch_available() else ()
     fx_compatible = False
     test_pruning = False
     test_torchscript = False
@@ -305,6 +308,12 @@ def test_save_load_low_cpu_mem_usage_checkpoints(self):
     def test_save_load_low_cpu_mem_usage_no_safetensors(self):
         pass
 
+    @unittest.skip(
+        reason="VLMs doen't accept inputs embeds and pixel values at the same time. So if the test passed for bacbone LM, it passes for VLM also"
+    )
+    def test_generate_from_inputs_embeds_with_static_cache(self):
+        pass
+
 
 @slow
 @require_torch
diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
index 6b8072fc184f..956243dccebe 100644
--- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
@@ -58,11 +58,11 @@ class Qwen2VLVisionText2TextModelTester:
     def __init__(
         self,
         parent,
-        batch_size=8,
+        batch_size=2,
         seq_length=7,
         num_channels=3,
         ignore_index=-100,
-        image_size=28,
+        image_size=14,
         bos_token_id=0,
         eos_token_id=1,
         pad_token_id=2,
@@ -90,7 +90,7 @@ def __init__(
             "mlp_ratio": 4,
             "num_heads": 4,
             "patch_size": 14,
-            "spatial_merge_size": 2,
+            "spatial_merge_size": 1,
             "temporal_patch_size": 2,
         },
         rope_scaling={"type": "mrope", "mrope_section": [2, 1, 1]},
@@ -119,9 +119,10 @@ def __init__(
         self.batch_size = batch_size
         self.num_channels = num_channels
         self.image_size = image_size
-        self.seq_length = seq_length
         self.is_training = is_training
         self.vocab_size = vocab_size
+        self.num_image_tokens = 32
+        self.seq_length = seq_length + self.num_image_tokens
 
     def get_config(self):
         return Qwen2VLConfig(
@@ -162,23 +163,19 @@ def prepare_config_and_inputs(self):
     def prepare_config_and_inputs_for_common(self):
         config_and_inputs = self.prepare_config_and_inputs()
         config, pixel_values = config_and_inputs
-        vision_seqlen = pixel_values.shape[0] // self.batch_size // (self.vision_config["spatial_merge_size"] ** 2)
-        input_ids = ids_tensor([self.batch_size, self.seq_length - 1 + vision_seqlen], self.vocab_size)
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
         attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
 
         input_ids[input_ids == self.image_token_id] = self.pad_token_id
-        input_ids[:, torch.arange(vision_seqlen, device=torch_device) + 1] = self.image_token_id
+        input_ids[:, self.num_image_tokens] = self.image_token_id
         labels = torch.zeros(
-            (self.batch_size, self.seq_length - 1 + vision_seqlen),
+            (self.batch_size, self.seq_length),
             dtype=torch.long,
             device=torch_device,
         )
-        patch_size = self.vision_config["patch_size"]
         inputs_dict = {
             "pixel_values": pixel_values,
-            "image_grid_thw": torch.tensor(
-                [[1, self.image_size // patch_size, self.image_size // patch_size]] * self.batch_size
-            ),
+            "image_grid_thw": torch.tensor([[1, 1, 1]] * self.batch_size),
             "input_ids": input_ids,
             "attention_mask": attention_mask,
             "labels": labels,
@@ -312,6 +309,12 @@ def test_model_is_small(self):
     def test_beam_search_low_memory(self):
         pass
 
+    @unittest.skip(
+        reason="VLMs can't generate from inputs embeds and pixels. This can be tested as part of bacbone LM, no need to run the tes for VLMs"
+    )
+    def test_generate_from_inputs_embeds_with_static_cache(self):
+        pass
+
 
 @require_torch
 class Qwen2VLIntegrationTest(unittest.TestCase):
diff --git a/tests/models/reformer/test_modeling_reformer.py b/tests/models/reformer/test_modeling_reformer.py
index ba0a9232847a..11c2e821975d 100644
--- a/tests/models/reformer/test_modeling_reformer.py
+++ b/tests/models/reformer/test_modeling_reformer.py
@@ -689,12 +689,15 @@ def _get_input_ids_and_config(self, batch_size=2):
         # decreasing the seq_length in tester causes errors for "training_tests", those need exactly max seq length
         # NOTE: seq_length has to be multiple of 4, otherwise it fails for other tests
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        input_ids = inputs_dict[self.input_name]
+        input_ids = inputs_dict.pop(self.input_name)
+        _ = inputs_dict.pop("attention_mask", None)
+        _ = inputs_dict.pop("decoder_input_ids", None)
+        _ = inputs_dict.pop("decoder_attention_mask", None)
         input_ids = input_ids[:batch_size, :16]
         attention_mask = torch.ones_like(input_ids, dtype=torch.long)[:batch_size, :16]
         config.eos_token_id = None
         config.forced_eos_token_id = None
-        return config, input_ids, attention_mask
+        return config, input_ids, attention_mask, inputs_dict
 
 
 @require_torch
diff --git a/tests/models/speech_to_text/test_modeling_speech_to_text.py b/tests/models/speech_to_text/test_modeling_speech_to_text.py
index 4f19cc01b33c..9a7b34211c1c 100644
--- a/tests/models/speech_to_text/test_modeling_speech_to_text.py
+++ b/tests/models/speech_to_text/test_modeling_speech_to_text.py
@@ -285,7 +285,7 @@ class Speech2TextModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTest
     input_name = "input_features"
 
     def _get_input_ids_and_config(self, batch_size=2):
-        config, input_ids, attention_mask = GenerationTesterMixin._get_input_ids_and_config(self)
+        config, input_ids, attention_mask, inputs_dict = GenerationTesterMixin._get_input_ids_and_config(self)
 
         # `input_ids` is actually `input_features` which is a 3D tensor.
         # We must overwrite the mask to make it 2D since the original `_get_input_ids_and_config` creates an
@@ -294,7 +294,7 @@ def _get_input_ids_and_config(self, batch_size=2):
             sequence_length = input_ids.shape[1]
             attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=attention_mask.device)
 
-        return config, input_ids, attention_mask
+        return config, input_ids, attention_mask, inputs_dict
 
     def setUp(self):
         self.model_tester = Speech2TextModelTester(self)
diff --git a/tests/models/video_llava/test_modeling_video_llava.py b/tests/models/video_llava/test_modeling_video_llava.py
index 1f8883453754..df8fe0b5dca2 100644
--- a/tests/models/video_llava/test_modeling_video_llava.py
+++ b/tests/models/video_llava/test_modeling_video_llava.py
@@ -75,14 +75,14 @@ def __init__(
             "initializer_range": 0.02,
             "num_labels": 3,
             "num_choices": 4,
-            "pad_token_id": 0,
+            "pad_token_id": 3,
         },
         is_training=True,
         vision_config={
             "model_type": "clip_vision_model",
             "batch_size": 12,
             "image_size": 30,
-            "patch_size": 2,
+            "patch_size": 6,
             "num_channels": 3,
             "is_training": True,
             "hidden_size": 32,
@@ -104,8 +104,8 @@ def __init__(
         self.vision_feature_layer = vision_feature_layer
         self.text_config = text_config
         self.vision_config = vision_config
-        self.seq_length = seq_length
         self.num_frames = num_frames
+        self.pad_token_id = text_config["pad_token_id"]
 
         self.num_hidden_layers = text_config["num_hidden_layers"]
         self.vocab_size = text_config["vocab_size"]
@@ -116,7 +116,10 @@ def __init__(
         self.batch_size = 5
         self.num_channels = 3
         self.image_size = 224
-        self.encoder_seq_length = 2044
+        self.encoder_seq_length = 64
+        self.num_image_tokens = 25
+        self.num_video_tokens = 26
+        self.seq_length = seq_length + self.num_image_tokens + self.num_video_tokens
 
     def get_config(self):
         return VideoLlavaConfig(
@@ -128,6 +131,8 @@ def get_config(self):
             projector_hidden_act=self.projector_hidden_act,
             vision_feature_select_strategy=self.vision_feature_select_strategy,
             vision_feature_layer=self.vision_feature_layer,
+            image_seq_length=self.num_image_tokens,
+            video_seq_length=self.num_video_tokens,
         )
 
     def prepare_config_and_inputs(self):
@@ -159,11 +164,11 @@ def prepare_config_and_inputs_for_common(self):
         input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1
         attention_mask = input_ids.ne(1).to(torch_device)
 
-        # we are giving 3 videos and 3 images. Need to pass in image and video tokens, both
-        # also need to make sure no other special tokens are set
-        input_ids[(input_ids == 0) | (input_ids == 1)] = 3
-        input_ids[:, 0] = config.video_token_index
-        input_ids[:, 1:2] = config.image_token_index
+        input_ids[(input_ids == config.image_token_index) | (input_ids == config.video_token_index)] = (
+            self.pad_token_id
+        )
+        input_ids[:, : self.num_image_tokens] = config.image_token_index
+        input_ids[:, self.num_image_tokens : self.num_video_tokens + self.num_image_tokens] = config.video_token_index
         inputs_dict = {
             "pixel_values_videos": pixel_values_videos,
             "pixel_values_images": pixel_values_images,
@@ -196,6 +201,7 @@ class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe
     """
 
     all_model_classes = (VideoLlavaForConditionalGeneration,) if is_torch_available() else ()
+    all_generative_model_classes = (VideoLlavaForConditionalGeneration,) if is_torch_available() else ()
     fx_compatible = False
     test_pruning = False
     test_resize_embeddings = True
@@ -242,16 +248,16 @@ def test_mixed_input(self):
             # if we remove some images from inputs leaving only one
             # image number mismatch error should raise
             inputs["pixel_values_images"] = inputs["pixel_values_images"][:1]
-            with self.assertRaises(ValueError):
+            with self.assertRaises(RuntimeError):
                 _ = model(**inputs)
 
     def test_video_only_input(self):
         config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
         for model_class in self.all_model_classes:
             model = model_class(config).to(torch_device).eval()
-            # replace video_token with dummy id which is not video token id
-            # error that video-tokens and num-of-video-inputs mismatch will be raised
-            inputs["input_ids"][:, 1:2] = 2
+            # replace image token id with dummy id
+            # Error will be raised as num-image-tokens and num-of-image-embeds mismatch
+            inputs["input_ids"][:, : self.model_tester.num_image_tokens] = 2
             with self.assertRaises(ValueError):
                 _ = model(**inputs)
 
@@ -262,8 +268,13 @@ def test_image_only_input(self):
         config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
         for model_class in self.all_model_classes:
             model = model_class(config).to(torch_device).eval()
-            # set dummy id, which is not image token id, same as above
-            inputs["input_ids"][:, :1] = 2
+            # set dummy id, which is not video token id
+            # Error will be raised as num-video-tokens and num-of-video-embeds mismatch
+            inputs["input_ids"][
+                :,
+                self.model_tester.num_image_tokens : self.model_tester.num_image_tokens
+                + self.model_tester.num_video_tokens,
+            ] = 2
             with self.assertRaises(ValueError):
                 _ = model(**inputs)
 
diff --git a/tests/models/vipllava/test_modeling_vipllava.py b/tests/models/vipllava/test_modeling_vipllava.py
index c5a363da9f5d..b12f2c30c774 100644
--- a/tests/models/vipllava/test_modeling_vipllava.py
+++ b/tests/models/vipllava/test_modeling_vipllava.py
@@ -28,6 +28,7 @@
 )
 from transformers.testing_utils import require_bitsandbytes, require_torch, require_torch_gpu, slow, torch_device
 
+from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
 
@@ -73,7 +74,7 @@ def __init__(
             "initializer_range": 0.02,
             "num_labels": 3,
             "num_choices": 4,
-            "pad_token_id": 0,
+            "pad_token_id": 1,
         },
         is_training=True,
         vision_config={
@@ -99,7 +100,7 @@ def __init__(
         self.vision_feature_layers = vision_feature_layers
         self.text_config = text_config
         self.vision_config = vision_config
-        self.seq_length = seq_length
+        self.pad_token_id = text_config["pad_token_id"]
 
         self.num_hidden_layers = text_config["num_hidden_layers"]
         self.vocab_size = text_config["vocab_size"]
@@ -111,6 +112,8 @@ def __init__(
         self.num_channels = 3
         self.image_size = 336
         self.encoder_seq_length = 231
+        self.num_image_tokens = 224
+        self.seq_length = seq_length + self.num_image_tokens
 
     def get_config(self):
         return VipLlavaConfig(
@@ -120,6 +123,7 @@ def get_config(self):
             image_token_index=self.image_token_index,
             projector_hidden_act=self.projector_hidden_act,
             vision_feature_layers=self.vision_feature_layers,
+            image_seq_length=self.num_image_tokens,
         )
 
     def prepare_config_and_inputs(self):
@@ -140,8 +144,9 @@ def prepare_config_and_inputs_for_common(self):
         config, pixel_values = config_and_inputs
         input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1
         attention_mask = input_ids.ne(1).to(torch_device)
-        # we are giving 3 images let's make sure we pass in 3 image tokens
-        input_ids[:, 1] = config.image_token_index
+
+        input_ids[input_ids == config.image_token_index] = self.pad_token_id
+        input_ids[:, : self.num_image_tokens] = config.image_token_index
         inputs_dict = {
             "pixel_values": pixel_values,
             "input_ids": input_ids,
@@ -152,12 +157,13 @@ def prepare_config_and_inputs_for_common(self):
 
 @require_torch
 # Copied from transformers.tests.models.llava.test_modeling_llava.LlavaForConditionalGenerationModelTest with Llava->VipLlava
-class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestCase):
+class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
     """
     Model tester for `VipLlavaForConditionalGeneration`.
     """
 
     all_model_classes = (VipLlavaForConditionalGeneration,) if is_torch_available() else ()
+    all_generative_model_classes = (VipLlavaForConditionalGeneration,) if is_torch_available() else ()
     fx_compatible = False
     test_pruning = False
     test_resize_embeddings = True
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index f7ac2bc12eb2..70b38d3bf381 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -497,19 +497,6 @@ def test_encoder_decoder_model_standalone(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
         self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs)
 
-    def _get_input_ids_and_config(self, batch_size=3):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        input_ids = inputs_dict[self.input_name]
-
-        # cut to half length & take max batch_size=batch_size
-        input_ids = input_ids[:batch_size, :, :]
-
-        if config.eos_token_id is not None and config.pad_token_id is None:
-            # hack to allow generate for models such as GPT2 as is done in `generate()`
-            config.pad_token_id = config.eos_token_id
-
-        return config, input_ids, None
-
     def test_inputs_embeds(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index c7af0b1c9f5b..8eab385ad48a 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -4744,7 +4744,7 @@ def test_static_cache_matches_dynamic(self):
                 output_logits=True,
                 return_dict_in_generate=True,
             )
-            self.assertTrue(torch.allclose(dynamic_out.logits[0], static_out.logits[0], rtol=1e-3, atol=1e-4))
+            self.assertTrue(torch.allclose(dynamic_out.logits[0], static_out.logits[0], rtol=1e-3, atol=1e-3))
 
     # For now, Let's focus only on GPU for `torch.compile`
     @slow

From f3b3810fe6502a26bd51f187a2f8e1d41e295230 Mon Sep 17 00:00:00 2001
From: Joao Gante <joaofranciscocardosogante@gmail.com>
Date: Thu, 19 Sep 2024 11:55:26 +0100
Subject: [PATCH 48/67] rag: fix CI (#33578)

---
 tests/models/rag/test_modeling_rag.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/rag/test_modeling_rag.py b/tests/models/rag/test_modeling_rag.py
index 392ff40d7702..d00c06344118 100644
--- a/tests/models/rag/test_modeling_rag.py
+++ b/tests/models/rag/test_modeling_rag.py
@@ -653,7 +653,7 @@ class RagDPRT5Test(RagTestMixin, unittest.TestCase):
     def config_and_inputs(self):
         question_encoder_tester = DPRModelTester(self)
         dpr_config_and_inputs = question_encoder_tester.prepare_config_and_inputs()
-        generator_tester = T5ModelTester(self, vocab_size=1100)
+        generator_tester = T5ModelTester(self, vocab_size=1101)
         t5_config_and_inputs = generator_tester.prepare_config_and_inputs()
 
         (question_encoder_config, input_ids, _, input_mask, _, _, _) = dpr_config_and_inputs

From 80b774eb2906c34265c54b9534758ffc0d619cb7 Mon Sep 17 00:00:00 2001
From: Joao Gante <joaofranciscocardosogante@gmail.com>
Date: Thu, 19 Sep 2024 12:02:46 +0100
Subject: [PATCH 49/67] Cache: don't show warning in forward passes when
 `past_key_values` is None (#33541)

---
 docs/source/en/kv_cache.md                    | 33 ++++++++++++++++---
 .../models/bloom/modeling_bloom.py            | 26 ++++++++-------
 .../models/codegen/modeling_codegen.py        | 22 ++++++++-----
 .../models/cohere/modeling_cohere.py          | 19 ++++++-----
 src/transformers/models/dbrx/modeling_dbrx.py | 19 ++++++-----
 .../models/falcon/modeling_falcon.py          | 24 ++++++++------
 src/transformers/models/gemma/diff_gemma.py   | 15 ++++++---
 .../models/gemma/modeling_gemma.py            | 28 ++++++++--------
 src/transformers/models/git/modeling_git.py   | 27 ++++++++-------
 .../models/gpt_neo/modeling_gpt_neo.py        | 24 ++++++++------
 .../models/gpt_neox/modeling_gpt_neox.py      | 22 ++++++++-----
 .../modeling_gpt_neox_japanese.py             | 22 ++++++++-----
 src/transformers/models/gptj/modeling_gptj.py | 22 ++++++++-----
 .../models/granite/modeling_granite.py        | 17 ++++++----
 .../models/idefics/modeling_idefics.py        | 14 +++++---
 .../models/idefics2/modeling_idefics2.py      | 14 ++++++--
 .../models/jetmoe/modeling_jetmoe.py          | 15 ++++++---
 .../models/llama/modeling_llama.py            | 19 ++++++-----
 .../models/mistral/modeling_mistral.py        | 17 ++++++----
 .../models/mixtral/modeling_mixtral.py        | 27 ++++++++-------
 src/transformers/models/olmo/modeling_olmo.py | 19 ++++++-----
 .../models/olmoe/modeling_olmoe.py            | 17 ++++++----
 .../models/persimmon/modeling_persimmon.py    | 27 ++++++++-------
 src/transformers/models/phi/modeling_phi.py   | 28 +++++++++-------
 src/transformers/models/phi3/modeling_phi3.py | 28 +++++++++-------
 .../models/qwen2/modeling_qwen2.py            | 27 ++++++++-------
 .../models/qwen2_moe/modeling_qwen2_moe.py    | 27 ++++++++-------
 .../models/stablelm/modeling_stablelm.py      | 27 ++++++++-------
 .../models/starcoder2/modeling_starcoder2.py  | 27 ++++++++-------
 29 files changed, 402 insertions(+), 251 deletions(-)

diff --git a/docs/source/en/kv_cache.md b/docs/source/en/kv_cache.md
index 1a9ea1ac0019..05ab9eafa723 100644
--- a/docs/source/en/kv_cache.md
+++ b/docs/source/en/kv_cache.md
@@ -120,7 +120,7 @@ To enable quantization of the key-value cache, one needs to indicate `cache_impl
 Quantization related arguments should be passed to the `generation_config` either as a `dict` or an instance of a [`~QuantizedCacheConfig`] class.
 One has to indicate which quantization backend to use in the [`~QuantizedCacheConfig`], the default is `quanto`.
 
-It is recommended to set `axis-key/axis-value` parameters in the cache config to `0` if you're using the `quanto` backend and to `1` if you're using the `HQQ` backend. For other config values, please use the defaults unless you're running out of memory. In that case, you may consider decreasing the residual length. 
+It is recommended to set `axis-key/axis-value` parameters in the cache config to `0` if you're using the `quanto` backend and to `1` if you're using the `HQQ` backend. For other config values, please use the defaults unless you're running out of memory. In that case, you may consider decreasing the residual length.
 
 <Tip warning={true}>
 
@@ -308,7 +308,7 @@ Unlike other cache classes, this one can't be used directly by indicating a `cac
 
 ### Encoder-Decoder Cache
 
-The [`~EncoderDecoderCache`] is a wrapper designed to handle the caching needs of encoder-decoder models. This cache type is specifically built to manage both self-attention and cross-attention caches, ensuring storage and retrieval of past key/values required for these complex models. Cool thing about Encoder-Decoder Cache is that you can set different cache types for the encoder and for the decoder, depending on your use case. Currently this cache is only supported in [Whisper](./model_doc/whisper) models but we will be adding more models soon. 
+The [`~EncoderDecoderCache`] is a wrapper designed to handle the caching needs of encoder-decoder models. This cache type is specifically built to manage both self-attention and cross-attention caches, ensuring storage and retrieval of past key/values required for these complex models. Cool thing about Encoder-Decoder Cache is that you can set different cache types for the encoder and for the decoder, depending on your use case. Currently this cache is only supported in [Whisper](./model_doc/whisper) models but we will be adding more models soon.
 
 In terms of usage, there is nothing special to be done and calling `generate()` or `forward()` will handle everything for you.
 
@@ -379,7 +379,7 @@ Sometimes you would want to first fill-in cache object with key/values for certa
 >>> model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda")
 >>> tokenizer = AutoTokenizer.from_pretrained(model_id)
 
->>> # Init StaticCache with big enough max-length (1024 tokens for the below example) 
+>>> # Init StaticCache with big enough max-length (1024 tokens for the below example)
 >>> # You can also init a DynamicCache, if that suits you better
 >>> prompt_cache = StaticCache(config=model.config, max_batch_size=1, max_cache_len=1024, device="cuda", dtype=torch.bfloat16)
 
@@ -394,10 +394,35 @@ Sometimes you would want to first fill-in cache object with key/values for certa
 >>> for prompt in prompts:
 ...     new_inputs = tokenizer(INITIAL_PROMPT + prompt, return_tensors="pt").to("cuda")
 ...     past_key_values = copy.deepcopy(prompt_cache)
-...     outputs = model.generate(**new_inputs, past_key_values=past_key_values,max_new_tokens=20) 
+...     outputs = model.generate(**new_inputs, past_key_values=past_key_values,max_new_tokens=20)
 ...     response = tokenizer.batch_decode(outputs)[0]
 ...     responses.append(response)
 
 >>> print(responses)
 ['<s> You are a helpful assistant. Help me to write a blogpost about travelling.\n\nTitle: The Ultimate Guide to Travelling: Tips, Tricks, and', '<s> You are a helpful assistant. What is the capital of France?\n\nYes, the capital of France is Paris.</s>']
 ```
+
+
+## Legacy cache format
+
+Prior to the introduction of the `Cache` object, the cache of LLMs used to be a tuple of tuples of tensors. The legacy
+format has a dynamic size, growing as we generate text -- very similar to `DynamicCache`. If your project depend on
+this legacy format, you can seamlessly convert it to a `DynamicCache` and back.
+
+```python
+>>> import torch
+>>> from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache
+
+>>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
+>>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
+>>> inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
+
+>>> # `return_dict_in_generate=True` is required to return the cache. `return_legacy_cache` forces the returned cache
+>>> # to be of the legacy type
+>>> generation_outputs = model.generate(**inputs, return_dict_in_generate=True, return_legacy_cache=True, max_new_tokens=5)
+
+>>> # We can convert a legacy cache to a DynamicCache -- and the other way around. This is helpful if you have custom
+>>> # logic to manipulate a cache in a specific format.
+>>> cache = DynamicCache.from_legacy_cache(generation_outputs.past_key_values)
+>>> legacy_format_cache = cache.to_legacy_cache()
+```
diff --git a/src/transformers/models/bloom/modeling_bloom.py b/src/transformers/models/bloom/modeling_bloom.py
index d32ab95f51c7..b5b221b6b37f 100644
--- a/src/transformers/models/bloom/modeling_bloom.py
+++ b/src/transformers/models/bloom/modeling_bloom.py
@@ -687,14 +687,18 @@ def forward(
             inputs_embeds = self.word_embeddings(input_ids)
 
         # kept for BC (non `Cache` `past_key_values` inputs)
-        use_legacy_cache = False
-        if use_cache and not isinstance(past_key_values, Cache) and not self.training:
-            use_legacy_cache = True
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-            logger.warning_once(
-                "Using `past_key_values` as a tuple is deprecated and will be removed in v4.45. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
-            )
+        return_legacy_cache = False
+        if use_cache and not isinstance(past_key_values, Cache):
+            return_legacy_cache = True
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
 
         batch_size, seq_length, _ = inputs_embeds.shape
         past_length = past_key_values.get_seq_length() if past_key_values is not None else 0
@@ -765,9 +769,9 @@ def forward(
         if output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)
 
-        next_cache = None
-        if use_cache:
-            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+        next_cache = next_decoder_cache if use_cache else None
+        if return_legacy_cache:
+            next_cache = next_cache.to_legacy_cache()
 
         if not return_dict:
             return tuple(
diff --git a/src/transformers/models/codegen/modeling_codegen.py b/src/transformers/models/codegen/modeling_codegen.py
index 46eea43e1285..be57838975c0 100644
--- a/src/transformers/models/codegen/modeling_codegen.py
+++ b/src/transformers/models/codegen/modeling_codegen.py
@@ -526,14 +526,18 @@ def forward(
         if inputs_embeds is None:
             inputs_embeds = self.wte(input_ids)
 
-        use_legacy_cache = False
+        # kept for BC (non `Cache` `past_key_values` inputs)
+        return_legacy_cache = False
         if use_cache and not isinstance(past_key_values, Cache):
-            use_legacy_cache = True
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-            if not self.training:
+            return_legacy_cache = True
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
                 logger.warning_once(
-                    "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.45. "
-                    "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
                 )
 
         seq_length = inputs_embeds.shape[1]
@@ -608,9 +612,9 @@ def forward(
         if output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)
 
-        next_cache = None
-        if use_cache:
-            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+        next_cache = next_decoder_cache if use_cache else None
+        if return_legacy_cache:
+            next_cache = next_cache.to_legacy_cache()
 
         if not return_dict:
             return tuple(
diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py
index ae84a9ec2d1a..cb1b3f885798 100644
--- a/src/transformers/models/cohere/modeling_cohere.py
+++ b/src/transformers/models/cohere/modeling_cohere.py
@@ -910,16 +910,19 @@ def forward(
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
 
+        # kept for BC (non `Cache` `past_key_values` inputs)
         return_legacy_cache = False
-        if (
-            use_cache and not isinstance(past_key_values, Cache) and not self.training
-        ):  # kept for BC (non `Cache` `past_key_values` inputs)
+        if use_cache and not isinstance(past_key_values, Cache):
             return_legacy_cache = True
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-            logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
-            )
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
 
         if cache_position is None:
             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py
index 43bac44ba1be..7263713c0840 100644
--- a/src/transformers/models/dbrx/modeling_dbrx.py
+++ b/src/transformers/models/dbrx/modeling_dbrx.py
@@ -1059,16 +1059,19 @@ def forward(
 
         inputs_embeds = nn.functional.dropout(inputs_embeds, p=self.emb_pdrop, training=self.training)
 
+        # kept for BC (non `Cache` `past_key_values` inputs)
         return_legacy_cache = False
-        if (
-            use_cache and not isinstance(past_key_values, Cache) and not self.training
-        ):  # kept for BC (non `Cache` `past_key_values` inputs)
+        if use_cache and not isinstance(past_key_values, Cache):
             return_legacy_cache = True
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-            logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
-            )
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
 
         if cache_position is None:
             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py
index 73e8806352eb..9a37fe22e177 100644
--- a/src/transformers/models/falcon/modeling_falcon.py
+++ b/src/transformers/models/falcon/modeling_falcon.py
@@ -1031,17 +1031,21 @@ def forward(
         if inputs_embeds is None:
             inputs_embeds = self.word_embeddings(input_ids)
 
-        # Compute alibi tensor: check build_alibi_tensor documentation
-        use_legacy_cache = False
+        # kept for BC (non `Cache` `past_key_values` inputs)
+        return_legacy_cache = False
         if use_cache and not isinstance(past_key_values, Cache):
-            use_legacy_cache = True
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-            if not self.training:
+            return_legacy_cache = True
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
                 logger.warning_once(
-                    "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.45. "
-                    "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
                 )
 
+        # Compute alibi tensor: check build_alibi_tensor documentation
         alibi = None
         past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
         batch_size, seq_length, _ = inputs_embeds.shape
@@ -1126,9 +1130,9 @@ def forward(
         if output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)
 
-        next_cache = None
-        if use_cache:
-            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+        next_cache = next_decoder_cache if use_cache else None
+        if return_legacy_cache:
+            next_cache = next_cache.to_legacy_cache()
 
         if not return_dict:
             return tuple(
diff --git a/src/transformers/models/gemma/diff_gemma.py b/src/transformers/models/gemma/diff_gemma.py
index fcdb0f0b3d7d..36f2d1c594ab 100644
--- a/src/transformers/models/gemma/diff_gemma.py
+++ b/src/transformers/models/gemma/diff_gemma.py
@@ -476,12 +476,19 @@ def forward(
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
 
+        # kept for BC (non `Cache` `past_key_values` inputs)
         return_legacy_cache = False  # noqa: F841
-        if (
-            use_cache and not isinstance(past_key_values, Cache) and not self.training
-        ):  # kept for BC (non `Cache` `past_key_values` inputs)
+        if use_cache and not isinstance(past_key_values, Cache):
             return_legacy_cache = True  # noqa: F841
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
 
         if cache_position is None:
             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py
index b14e0a4b3d8c..dd4c899d13d4 100644
--- a/src/transformers/models/gemma/modeling_gemma.py
+++ b/src/transformers/models/gemma/modeling_gemma.py
@@ -828,12 +828,19 @@ def forward(
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
 
-        return_legacy_cache = False  # noqa: F841
-        if (
-            use_cache and not isinstance(past_key_values, Cache) and not self.training
-        ):  # kept for BC (non `Cache` `past_key_values` inputs)
-            return_legacy_cache = True  # noqa: F841
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+        # kept for BC (non `Cache` `past_key_values` inputs)
+        return_legacy_cache = False
+        if use_cache and not isinstance(past_key_values, Cache):
+            return_legacy_cache = True
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
 
         if cache_position is None:
             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
@@ -856,15 +863,6 @@ def forward(
         # See https://github.com/huggingface/transformers/pull/29402
         normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype)
         hidden_states = hidden_states * normalizer
-        if (
-            use_cache and not isinstance(past_key_values, Cache) and not self.training
-        ):  # kept for BC (non `Cache` `past_key_values` inputs)
-            return_legacy_cache = True
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-            logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
-            )
 
         # decoder layers
         all_hidden_states = () if output_hidden_states else None
diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py
index 43333e3d3338..7471eec811a0 100644
--- a/src/transformers/models/git/modeling_git.py
+++ b/src/transformers/models/git/modeling_git.py
@@ -417,14 +417,19 @@ def forward(
                 )
                 use_cache = False
 
-        use_legacy_cache = False
-        if use_cache and not isinstance(past_key_values, Cache) and not self.training:
-            use_legacy_cache = True
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-            logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.45. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
-            )
+        # kept for BC (non `Cache` `past_key_values` inputs)
+        return_legacy_cache = False
+        if use_cache and not isinstance(past_key_values, Cache):
+            return_legacy_cache = True
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
 
         all_hidden_states = () if output_hidden_states else None
         all_self_attentions = () if output_attentions else None
@@ -463,9 +468,9 @@ def forward(
         if output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)
 
-        next_cache = None
-        if use_cache:
-            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+        next_cache = next_decoder_cache if use_cache else None
+        if return_legacy_cache:
+            next_cache = next_cache.to_legacy_cache()
 
         if not return_dict:
             return tuple(
diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py
index 72590862b749..28309f7738eb 100755
--- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py
+++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py
@@ -741,14 +741,18 @@ def forward(
         if inputs_embeds is None:
             inputs_embeds = self.wte(input_ids)
 
-        use_legacy_cache = False
-        if use_cache and not isinstance(past_key_values, Cache) and not self.training:
-            use_legacy_cache = True
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-            if not self.training:
+        # kept for BC (non `Cache` `past_key_values` inputs)
+        return_legacy_cache = False
+        if use_cache and not isinstance(past_key_values, Cache):
+            return_legacy_cache = True
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
                 logger.warning_once(
-                    "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.45. "
-                    "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
                 )
 
         seq_length = inputs_embeds.shape[1]
@@ -822,9 +826,9 @@ def forward(
         if output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)
 
-        next_cache = None
-        if use_cache:
-            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+        next_cache = next_decoder_cache if use_cache else None
+        if return_legacy_cache:
+            next_cache = next_cache.to_legacy_cache()
 
         if not return_dict:
             return tuple(
diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py
index e88302efa7bb..274c571fa893 100755
--- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py
+++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py
@@ -943,14 +943,18 @@ def forward(
         if inputs_embeds is None:
             inputs_embeds = self.embed_in(input_ids)
 
-        use_legacy_cache = False
+        # kept for BC (non `Cache` `past_key_values` inputs)
+        return_legacy_cache = False
         if use_cache and not isinstance(past_key_values, Cache):
-            use_legacy_cache = True
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-            if not self.training:
+            return_legacy_cache = True
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
                 logger.warning_once(
-                    "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.45. "
-                    "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
                 )
 
         seq_length = inputs_embeds.shape[1]
@@ -1021,9 +1025,9 @@ def forward(
         if output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)
 
-        next_cache = None
-        if use_cache:
-            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+        next_cache = next_decoder_cache if use_cache else None
+        if return_legacy_cache:
+            next_cache = next_cache.to_legacy_cache()
 
         if not return_dict:
             return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_attentions] if v is not None)
diff --git a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py
index bf832195b4ef..048e108a8ec2 100755
--- a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py
+++ b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py
@@ -663,14 +663,18 @@ def forward(
         if inputs_embeds is None:
             inputs_embeds = self.embed_in(input_ids)
 
-        use_legacy_cache = False
+        # kept for BC (non `Cache` `past_key_values` inputs)
+        return_legacy_cache = False
         if use_cache and not isinstance(past_key_values, Cache):
-            use_legacy_cache = True
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-            if not self.training:
+            return_legacy_cache = True
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
                 logger.warning_once(
-                    "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.45. "
-                    "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
                 )
 
         seq_length = inputs_embeds.shape[1]
@@ -725,9 +729,9 @@ def forward(
         if output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)
 
-        next_cache = None
-        if use_cache:
-            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+        next_cache = next_decoder_cache if use_cache else None
+        if return_legacy_cache:
+            next_cache = next_cache.to_legacy_cache()
 
         if not return_dict:
             return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_attentions] if v is not None)
diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py
index bd7ce5696fa0..84f6d985f764 100644
--- a/src/transformers/models/gptj/modeling_gptj.py
+++ b/src/transformers/models/gptj/modeling_gptj.py
@@ -813,14 +813,18 @@ def forward(
         if inputs_embeds is None:
             inputs_embeds = self.wte(input_ids)
 
-        use_legacy_cache = False
+        # kept for BC (non `Cache` `past_key_values` inputs)
+        return_legacy_cache = False
         if use_cache and not isinstance(past_key_values, Cache):
-            use_legacy_cache = True
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-            if not self.training:
+            return_legacy_cache = True
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
                 logger.warning_once(
-                    "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.45. "
-                    "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
                 )
 
         seq_length = inputs_embeds.shape[1]
@@ -917,9 +921,9 @@ def forward(
         if output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)
 
-        next_cache = None
-        if use_cache:
-            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+        next_cache = next_decoder_cache if use_cache else None
+        if return_legacy_cache:
+            next_cache = next_cache.to_legacy_cache()
 
         if not return_dict:
             return tuple(
diff --git a/src/transformers/models/granite/modeling_granite.py b/src/transformers/models/granite/modeling_granite.py
index 876f5ed2a7c8..f62de411a4fa 100644
--- a/src/transformers/models/granite/modeling_granite.py
+++ b/src/transformers/models/granite/modeling_granite.py
@@ -834,14 +834,19 @@ def forward(
 
         inputs_embeds = inputs_embeds * self.embedding_multiplier
 
+        # kept for BC (non `Cache` `past_key_values` inputs)
         return_legacy_cache = False
-        if use_cache and not isinstance(past_key_values, Cache):  # kept for BC (non `Cache` `past_key_values` inputs)
+        if use_cache and not isinstance(past_key_values, Cache):
             return_legacy_cache = True
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-            logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
-            )
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
 
         if cache_position is None:
             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py
index 0d26c4fc65de..1289bda2d0fd 100644
--- a/src/transformers/models/idefics/modeling_idefics.py
+++ b/src/transformers/models/idefics/modeling_idefics.py
@@ -1239,15 +1239,19 @@ def forward(
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
 
+        # kept for BC (non `Cache` `past_key_values` inputs)
         return_legacy_cache = False
         if use_cache and not isinstance(past_key_values, Cache):
-            if not self.training:
+            return_legacy_cache = True
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
                 logger.warning_once(
-                    "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.45. "
-                    "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
                 )
-            return_legacy_cache = True
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
 
         batch_size, seq_length, _ = inputs_embeds.shape
         past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py
index 6108f0e8a42e..41be300095e7 100644
--- a/src/transformers/models/idefics2/modeling_idefics2.py
+++ b/src/transformers/models/idefics2/modeling_idefics2.py
@@ -1345,11 +1345,19 @@ def forward(
             raise ValueError("You have to specify either input_ids or inputs_embeds")
 
         past_seen_tokens = 0
+        # kept for BC (non `Cache` `past_key_values` inputs)
         return_legacy_cache = False
-        if use_cache:
-            if not isinstance(past_key_values, Cache):  # kept for BC (non `Cache` `past_key_values` inputs)
-                return_legacy_cache = True
+        if use_cache and not isinstance(past_key_values, Cache):
+            return_legacy_cache = True
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
                 past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
             past_seen_tokens = past_key_values.get_seq_length()
 
         if inputs_embeds is not None and input_ids is None and past_seen_tokens == 0:
diff --git a/src/transformers/models/jetmoe/modeling_jetmoe.py b/src/transformers/models/jetmoe/modeling_jetmoe.py
index c273b021d736..7c4394d0e1a1 100644
--- a/src/transformers/models/jetmoe/modeling_jetmoe.py
+++ b/src/transformers/models/jetmoe/modeling_jetmoe.py
@@ -1033,12 +1033,19 @@ def forward(
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
 
+        # kept for BC (non `Cache` `past_key_values` inputs)
         return_legacy_cache = False
-        if (
-            use_cache and not isinstance(past_key_values, Cache) and not self.training
-        ):  # kept for BC (non `Cache` `past_key_values` inputs)
+        if use_cache and not isinstance(past_key_values, Cache):
             return_legacy_cache = True
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
 
         if cache_position is None:
             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py
index c7017832b932..0bc44f314b5e 100644
--- a/src/transformers/models/llama/modeling_llama.py
+++ b/src/transformers/models/llama/modeling_llama.py
@@ -944,16 +944,19 @@ def forward(
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
 
+        # kept for BC (non `Cache` `past_key_values` inputs)
         return_legacy_cache = False
-        if (
-            use_cache and not isinstance(past_key_values, Cache) and not self.training
-        ):  # kept for BC (non `Cache` `past_key_values` inputs)
+        if use_cache and not isinstance(past_key_values, Cache):
             return_legacy_cache = True
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-            logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
-            )
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
 
         if cache_position is None:
             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py
index ffe16b272033..80992734046a 100644
--- a/src/transformers/models/mistral/modeling_mistral.py
+++ b/src/transformers/models/mistral/modeling_mistral.py
@@ -762,14 +762,19 @@ def forward(
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
 
+        # kept for BC (non `Cache` `past_key_values` inputs)
         return_legacy_cache = False
-        if use_cache and not isinstance(past_key_values, Cache) and not self.training:
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+        if use_cache and not isinstance(past_key_values, Cache):
             return_legacy_cache = True
-            logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
-            )
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
 
         if cache_position is None:
             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py
index c7062e75b108..fcc0d66e19c4 100644
--- a/src/transformers/models/mixtral/modeling_mixtral.py
+++ b/src/transformers/models/mixtral/modeling_mixtral.py
@@ -1018,14 +1018,19 @@ def forward(
                 )
                 use_cache = False
 
-        use_legacy_cache = False
-        if use_cache and not isinstance(past_key_values, Cache) and not self.training:
-            use_legacy_cache = True
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-            logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
-            )
+        # kept for BC (non `Cache` `past_key_values` inputs)
+        return_legacy_cache = False
+        if use_cache and not isinstance(past_key_values, Cache):
+            return_legacy_cache = True
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
 
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
@@ -1095,9 +1100,9 @@ def forward(
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
 
-        next_cache = None
-        if use_cache:
-            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+        next_cache = next_decoder_cache if use_cache else None
+        if return_legacy_cache:
+            next_cache = next_cache.to_legacy_cache()
 
         if not return_dict:
             return tuple(
diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py
index b4bda8e2db52..03fa524532a0 100644
--- a/src/transformers/models/olmo/modeling_olmo.py
+++ b/src/transformers/models/olmo/modeling_olmo.py
@@ -866,16 +866,19 @@ def forward(
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
 
+        # kept for BC (non `Cache` `past_key_values` inputs)
         return_legacy_cache = False
-        if (
-            use_cache and not isinstance(past_key_values, Cache) and not self.training
-        ):  # kept for BC (non `Cache` `past_key_values` inputs)
+        if use_cache and not isinstance(past_key_values, Cache):
             return_legacy_cache = True
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-            logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
-            )
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
 
         if cache_position is None:
             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
diff --git a/src/transformers/models/olmoe/modeling_olmoe.py b/src/transformers/models/olmoe/modeling_olmoe.py
index a33338365312..2cbde7dc8631 100644
--- a/src/transformers/models/olmoe/modeling_olmoe.py
+++ b/src/transformers/models/olmoe/modeling_olmoe.py
@@ -1007,14 +1007,19 @@ def forward(
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
 
+        # kept for BC (non `Cache` `past_key_values` inputs)
         return_legacy_cache = False
-        if use_cache and not isinstance(past_key_values, Cache):  # kept for BC (non `Cache` `past_key_values` inputs)
+        if use_cache and not isinstance(past_key_values, Cache):
             return_legacy_cache = True
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-            logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
-            )
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
 
         if cache_position is None:
             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py
index ccaa2c7fd29a..a6fd2284afb6 100644
--- a/src/transformers/models/persimmon/modeling_persimmon.py
+++ b/src/transformers/models/persimmon/modeling_persimmon.py
@@ -685,14 +685,19 @@ def forward(
                 )
                 use_cache = False
 
-        use_legacy_cache = False
-        if use_cache and not isinstance(past_key_values, Cache) and not self.training:
-            use_legacy_cache = True
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-            logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
-            )
+        # kept for BC (non `Cache` `past_key_values` inputs)
+        return_legacy_cache = False
+        if use_cache and not isinstance(past_key_values, Cache):
+            return_legacy_cache = True
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
 
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
@@ -761,9 +766,9 @@ def forward(
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
 
-        next_cache = None
-        if use_cache:
-            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+        next_cache = next_decoder_cache if use_cache else None
+        if return_legacy_cache:
+            next_cache = next_cache.to_legacy_cache()
 
         if not return_dict:
             return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py
index 648d1653a3b5..4d0a076b5f9a 100644
--- a/src/transformers/models/phi/modeling_phi.py
+++ b/src/transformers/models/phi/modeling_phi.py
@@ -976,14 +976,19 @@ def forward(
                 )
                 use_cache = False
 
-        use_legacy_cache = False
-        if use_cache and not isinstance(past_key_values, Cache) and not self.training:
-            use_legacy_cache = True
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-            logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
-            )
+        # kept for BC (non `Cache` `past_key_values` inputs)
+        return_legacy_cache = False
+        if use_cache and not isinstance(past_key_values, Cache):
+            return_legacy_cache = True
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
 
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
@@ -1053,9 +1058,10 @@ def forward(
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
 
-        next_cache = None
-        if use_cache:
-            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+        next_cache = next_decoder_cache if use_cache else None
+        if return_legacy_cache:
+            next_cache = next_cache.to_legacy_cache()
+
         if not return_dict:
             return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
         return BaseModelOutputWithPast(
diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py
index ec395679ae62..e0ca84be1848 100644
--- a/src/transformers/models/phi3/modeling_phi3.py
+++ b/src/transformers/models/phi3/modeling_phi3.py
@@ -1003,14 +1003,19 @@ def forward(
                 )
                 use_cache = False
 
-        use_legacy_cache = False
-        if use_cache and not isinstance(past_key_values, Cache) and not self.training:
-            use_legacy_cache = True
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-            logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
-            )
+        # kept for BC (non `Cache` `past_key_values` inputs)
+        return_legacy_cache = False
+        if use_cache and not isinstance(past_key_values, Cache):
+            return_legacy_cache = True
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
 
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
@@ -1074,9 +1079,10 @@ def forward(
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
 
-        next_cache = None
-        if use_cache:
-            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+        next_cache = next_decoder_cache if use_cache else None
+        if return_legacy_cache:
+            next_cache = next_cache.to_legacy_cache()
+
         if not return_dict:
             return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
         return BaseModelOutputWithPast(
diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py
index d0ea8ef0e376..aafecb95b6aa 100644
--- a/src/transformers/models/qwen2/modeling_qwen2.py
+++ b/src/transformers/models/qwen2/modeling_qwen2.py
@@ -915,14 +915,19 @@ def forward(
                 )
                 use_cache = False
 
-        use_legacy_cache = False
-        if use_cache and not isinstance(past_key_values, Cache) and not self.training:
-            use_legacy_cache = True
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-            logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
-            )
+        # kept for BC (non `Cache` `past_key_values` inputs)
+        return_legacy_cache = False
+        if use_cache and not isinstance(past_key_values, Cache):
+            return_legacy_cache = True
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
 
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
@@ -991,9 +996,9 @@ def forward(
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
 
-        next_cache = None
-        if use_cache:
-            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+        next_cache = next_decoder_cache if use_cache else None
+        if return_legacy_cache:
+            next_cache = next_cache.to_legacy_cache()
 
         if not return_dict:
             return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
index 6f483e50cde0..bc06b406bf43 100644
--- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
+++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
@@ -1079,14 +1079,19 @@ def forward(
                 )
                 use_cache = False
 
-        use_legacy_cache = False
-        if use_cache and not isinstance(past_key_values, Cache) and not self.training:
-            use_legacy_cache = True
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-            logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
-            )
+        # kept for BC (non `Cache` `past_key_values` inputs)
+        return_legacy_cache = False
+        if use_cache and not isinstance(past_key_values, Cache):
+            return_legacy_cache = True
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
 
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
@@ -1161,9 +1166,9 @@ def forward(
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
 
-        next_cache = None
-        if use_cache:
-            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+        next_cache = next_decoder_cache if use_cache else None
+        if return_legacy_cache:
+            next_cache = next_cache.to_legacy_cache()
 
         if not return_dict:
             return tuple(
diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py
index d91c0832ed33..13641ecb37f2 100755
--- a/src/transformers/models/stablelm/modeling_stablelm.py
+++ b/src/transformers/models/stablelm/modeling_stablelm.py
@@ -960,14 +960,19 @@ def forward(
                 )
                 use_cache = False
 
-        use_legacy_cache = False
-        if use_cache and not isinstance(past_key_values, Cache) and not self.training:
-            use_legacy_cache = True
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-            logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
-            )
+        # kept for BC (non `Cache` `past_key_values` inputs)
+        return_legacy_cache = False
+        if use_cache and not isinstance(past_key_values, Cache):
+            return_legacy_cache = True
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
 
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
@@ -1036,9 +1041,9 @@ def forward(
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
 
-        next_cache = None
-        if use_cache:
-            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+        next_cache = next_decoder_cache if use_cache else None
+        if return_legacy_cache:
+            next_cache = next_cache.to_legacy_cache()
 
         if not return_dict:
             return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py
index 0be37c4e1fb9..5eaf50f090fa 100644
--- a/src/transformers/models/starcoder2/modeling_starcoder2.py
+++ b/src/transformers/models/starcoder2/modeling_starcoder2.py
@@ -889,14 +889,19 @@ def forward(
                 )
                 use_cache = False
 
-        use_legacy_cache = False
-        if use_cache and not isinstance(past_key_values, Cache) and not self.training:
-            use_legacy_cache = True
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-            logger.warning_once(
-                "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.46. "
-                "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)"
-            )
+        # kept for BC (non `Cache` `past_key_values` inputs)
+        return_legacy_cache = False
+        if use_cache and not isinstance(past_key_values, Cache):
+            return_legacy_cache = True
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
 
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
@@ -966,9 +971,9 @@ def forward(
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
 
-        next_cache = None
-        if use_cache:
-            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+        next_cache = next_decoder_cache if use_cache else None
+        if return_legacy_cache:
+            next_cache = next_cache.to_legacy_cache()
 
         if not return_dict:
             return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)

From 4f0246e5359d2d9b7c9f1e5b7c3d7dc79fed0e99 Mon Sep 17 00:00:00 2001
From: Pablo Montalvo <39954772+molbap@users.noreply.github.com>
Date: Thu, 19 Sep 2024 17:10:22 +0200
Subject: [PATCH 50/67] fix tests with main revision and read token (#33560)

* fix tests with main revision and read token

* [run-slow]mamba2

* test previously skipped tests

* [run-slow]mamba2

* skip some tests

* [run-slow]mamba2

* finalize tests

* [run-slow]mamba2
---
 tests/models/mamba2/test_modeling_mamba2.py | 37 +++++++--------------
 1 file changed, 12 insertions(+), 25 deletions(-)

diff --git a/tests/models/mamba2/test_modeling_mamba2.py b/tests/models/mamba2/test_modeling_mamba2.py
index 276ecf2fd6b0..a1e2138d4d6d 100644
--- a/tests/models/mamba2/test_modeling_mamba2.py
+++ b/tests/models/mamba2/test_modeling_mamba2.py
@@ -20,7 +20,7 @@
 from parameterized import parameterized
 
 from transformers import AutoTokenizer, Mamba2Config, is_torch_available
-from transformers.testing_utils import require_torch, require_torch_gpu, slow, torch_device
+from transformers.testing_utils import require_read_token, require_torch, require_torch_gpu, slow, torch_device
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
@@ -96,7 +96,7 @@ def __init__(
         self.tie_word_embeddings = tie_word_embeddings
 
     def get_large_model_config(self):
-        return Mamba2Config.from_pretrained("revision='refs/pr/9'")
+        return Mamba2Config.from_pretrained("mistralai/Mamba-Codestral-7B-v0.1")
 
     def prepare_config_and_inputs(
         self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False
@@ -199,34 +199,26 @@ def test_initialization(self):
     def test_tied_weights_keys(self):
         pass
 
-    @unittest.skip(reason="To fix, Mamba 2 cache slicing is interacting with beam search")
-    def test_beam_search_generate_dict_outputs_use_cache(self):
-        pass
-
-    @unittest.skip(reason="To fix, Mamba 2 cache slicing is interacting with beam search")
-    def test_beam_sample_generate(self):
+    @unittest.skip(reason="To fix, Mamba 2 cache slicing test case is an edge case")
+    def test_generate_without_input_ids(self):
         pass
 
     @unittest.skip(reason="To fix, Mamba 2 cache slicing test case is an edge case")
-    def test_generate_without_input_ids(self):
+    def test_generate_from_inputs_embeds_decoder_only(self):
         pass
 
     @unittest.skip(reason="To fix, Mamba 2 cache slicing test case is an edge case")
     def test_greedy_generate_dict_outputs_use_cache(self):
         pass
 
-    @unittest.skip(reason="Initialization of mamba2 fails this")
-    def test_save_load_fast_init_from_base(self):
+    @unittest.skip(reason="To fix, Mamba 2 cache slicing is interacting with beam search")
+    def test_beam_search_generate_dict_outputs_use_cache(self):
         pass
 
     @unittest.skip(reason="A large mamba2 would be necessary (and costly) for that")
     def test_multi_gpu_data_parallel_forward(self):
         pass
 
-    @unittest.skip(reason="To fix, Mamba 2 cache slicing test case is an edge case")
-    def test_generate_from_inputs_embeds_decoder_only(self):
-        pass
-
     def test_model_outputs_equivalence(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
@@ -292,12 +284,11 @@ def test_inputs_embeds_matches_input_ids_with_generate(self):
 
 @require_torch
 @slow
+@require_read_token
 class Mamba2IntegrationTest(unittest.TestCase):
     def setUp(self):
         self.model_id = "mistralai/Mamba-Codestral-7B-v0.1"
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            self.model_id, revision="refs/pr/9", from_slow=True, legacy=False
-        )
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, from_slow=True, legacy=False)
         self.prompt = ("[INST]Write a hello world program in C++.",)
 
     @parameterized.expand(
@@ -317,7 +308,7 @@ def test_simple_generate(self, device):
         tokenizer = self.tokenizer
         tokenizer.pad_token_id = tokenizer.eos_token_id
 
-        model = Mamba2ForCausalLM.from_pretrained(self.model_id, revision="refs/pr/9", torch_dtype=torch.bfloat16)
+        model = Mamba2ForCausalLM.from_pretrained(self.model_id, torch_dtype=torch.bfloat16)
         model.to(device)
         input_ids = tokenizer("[INST]Write a hello world program in C++.[/INST]", return_tensors="pt")["input_ids"].to(
             device
@@ -343,9 +334,7 @@ def test_batched_equivalence_with_cache(self):
             "[INST] Write a simple Fibonacci number computation function in Rust that does memoization, with comments, in safe Rust.[/INST]",
         ]
 
-        model = Mamba2ForCausalLM.from_pretrained(self.model_id, revision="refs/pr/9", torch_dtype=torch.bfloat16).to(
-            torch_device
-        )
+        model = Mamba2ForCausalLM.from_pretrained(self.model_id, torch_dtype=torch.bfloat16).to(torch_device)
         tokenizer.pad_token_id = tokenizer.eos_token_id
         # batched generation
         tokenized_prompts = tokenizer(prompt, return_tensors="pt", padding="longest").to(torch_device)
@@ -375,9 +364,7 @@ def test_batched_equivalence_without_cache(self):
             "[INST] Write a simple Fibonacci number computation function in Rust that does memoization, with comments, in safe Rust.[/INST]",
         ]
 
-        model = Mamba2ForCausalLM.from_pretrained(self.model_id, revision="refs/pr/9", torch_dtype=torch.bfloat16).to(
-            torch_device
-        )
+        model = Mamba2ForCausalLM.from_pretrained(self.model_id, torch_dtype=torch.bfloat16).to(torch_device)
         tokenizer.pad_token_id = tokenizer.eos_token_id
         # batched generation
         tokenized_prompts = tokenizer(prompt, return_tensors="pt", padding="longest").to(torch_device)

From 413008c580bc00f5caab01179fdcd9ec5f11fd78 Mon Sep 17 00:00:00 2001
From: Pablo Montalvo <39954772+molbap@users.noreply.github.com>
Date: Thu, 19 Sep 2024 17:21:54 +0200
Subject: [PATCH 51/67] add uniform processors for altclip + chinese_clip
 (#31198)

* add initial design for uniform processors + align model

* add uniform processors for altclip + chinese_clip

* fix mutable default :eyes:

* add configuration test

* handle structured kwargs w defaults + add test

* protect torch-specific test

* fix style

* fix

* rebase

* update processor to generic kwargs + test

* fix style

* add sensible kwargs merge

* update test

* fix assertEqual

* move kwargs merging to processing common

* rework kwargs for type hinting

* just get Unpack from extensions

* run-slow[align]

* handle kwargs passed as nested dict

* add from_pretrained test for nested kwargs handling

* [run-slow]align

* update documentation + imports

* update audio inputs

* protect audio types, silly

* try removing imports

* make things simpler

* simplerer

* move out kwargs test to common mixin

* [run-slow]align

* skip tests for old processors

* [run-slow]align, clip

* !$#@!! protect imports, darn it

* [run-slow]align, clip

* [run-slow]align, clip

* update common processor testing

* add altclip

* add chinese_clip

* add pad_size

* [run-slow]align, clip, chinese_clip, altclip

* remove duplicated tests

* fix

* update doc

* improve documentation for default values

* add model_max_length testing

This parameter depends on tokenizers received.

* Raise if kwargs are specified in two places

* fix

* match defaults

* force padding

* fix tokenizer test

* clean defaults

* move tests to common

* remove try/catch block

* deprecate kwarg

* format

* add copyright + remove unused method

* [run-slow]altclip, chinese_clip

* clean imports

* fix version

* clean up deprecation

* fix style

* add corner case test on kwarg overlap

* resume processing - add Unpack as importable

* add tmpdirname

* fix altclip

* fix up

* add back crop_size to specific tests

* generalize tests to possible video_processor

* add back crop_size arg

* fixup overlapping kwargs test for qformer_tokenizer

* remove copied from

* fixup chinese_clip tests values

* fixup tests - qformer tokenizers

* [run-slow] altclip, chinese_clip

* remove prepare_image_inputs
---
 .../models/align/processing_align.py          |   7 +-
 .../models/altclip/processing_altclip.py      |  73 ++++----
 .../image_processing_chinese_clip.py          |   1 +
 .../chinese_clip/processing_chinese_clip.py   |  43 +++--
 src/transformers/processing_utils.py          |  14 +-
 .../models/altclip/test_processor_altclip.py  | 165 ++++++++++++++++++
 .../test_processor_chinese_clip.py            | 126 +++++++++++++
 .../test_processor_instructblip.py            |  28 +++
 .../test_processor_instructblipvideo.py       |  28 +++
 tests/test_processing_common.py               |  30 +++-
 10 files changed, 463 insertions(+), 52 deletions(-)
 create mode 100644 tests/models/altclip/test_processor_altclip.py

diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py
index a5846a87d236..7cfe14e52b44 100644
--- a/src/transformers/models/align/processing_align.py
+++ b/src/transformers/models/align/processing_align.py
@@ -18,16 +18,11 @@
 
 from typing import List, Union
 
-
-try:
-    from typing import Unpack
-except ImportError:
-    from typing_extensions import Unpack
-
 from ...image_utils import ImageInput
 from ...processing_utils import (
     ProcessingKwargs,
     ProcessorMixin,
+    Unpack,
 )
 from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
 
diff --git a/src/transformers/models/altclip/processing_altclip.py b/src/transformers/models/altclip/processing_altclip.py
index 534349884283..153ecc2e2bfc 100644
--- a/src/transformers/models/altclip/processing_altclip.py
+++ b/src/transformers/models/altclip/processing_altclip.py
@@ -16,10 +16,16 @@
 Image/Text processor class for AltCLIP
 """
 
-import warnings
+from typing import List, Union
 
-from ...processing_utils import ProcessorMixin
-from ...tokenization_utils_base import BatchEncoding
+from ...image_utils import ImageInput
+from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
+from ...utils.deprecation import deprecate_kwarg
+
+
+class AltClipProcessorKwargs(ProcessingKwargs, total=False):
+    _defaults = {}
 
 
 class AltCLIPProcessor(ProcessorMixin):
@@ -41,17 +47,8 @@ class AltCLIPProcessor(ProcessorMixin):
     image_processor_class = "CLIPImageProcessor"
     tokenizer_class = ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast")
 
-    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
-        feature_extractor = None
-        if "feature_extractor" in kwargs:
-            warnings.warn(
-                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
-                " instead.",
-                FutureWarning,
-            )
-            feature_extractor = kwargs.pop("feature_extractor")
-
-        image_processor = image_processor if image_processor is not None else feature_extractor
+    @deprecate_kwarg(old_name="feature_extractor", version="5.0.0", new_name="image_processor")
+    def __init__(self, image_processor=None, tokenizer=None):
         if image_processor is None:
             raise ValueError("You need to specify an `image_processor`.")
         if tokenizer is None:
@@ -59,7 +56,14 @@ def __init__(self, image_processor=None, tokenizer=None, **kwargs):
 
         super().__init__(image_processor, tokenizer)
 
-    def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
+    def __call__(
+        self,
+        images: ImageInput = None,
+        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+        audio=None,
+        videos=None,
+        **kwargs: Unpack[AltClipProcessorKwargs],
+    ) -> BatchEncoding:
         """
         Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
         and `kwargs` arguments to XLMRobertaTokenizerFast's [`~XLMRobertaTokenizerFast.__call__`] if `text` is not
@@ -68,22 +72,20 @@ def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
         of the above two methods for more information.
 
         Args:
-            text (`str`, `List[str]`, `List[List[str]]`):
+
+            images (`ImageInput`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`):
                 The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                 (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                 `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
-            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
-                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
-                tensor. Both channels-first and channels-last formats are supported.
-
             return_tensors (`str` or [`~utils.TensorType`], *optional*):
                 If set, will return tensors of a particular framework. Acceptable values are:
-
-                - `'tf'`: Return TensorFlow `tf.constant` objects.
-                - `'pt'`: Return PyTorch `torch.Tensor` objects.
-                - `'np'`: Return NumPy `np.ndarray` objects.
-                - `'jax'`: Return JAX `jnp.ndarray` objects.
-
+                    - `'tf'`: Return TensorFlow `tf.constant` objects.
+                    - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                    - `'np'`: Return NumPy `np.ndarray` objects.
+                    - `'jax'`: Return JAX `jnp.ndarray` objects.
         Returns:
             [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
 
@@ -95,13 +97,24 @@ def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
         """
 
         if text is None and images is None:
-            raise ValueError("You have to specify either text or images. Both cannot be none.")
+            raise ValueError("You must specify either text or images.")
 
-        if text is not None:
-            encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
+        if text is None and images is None:
+            raise ValueError("You must specify either text or images.")
+        output_kwargs = self._merge_kwargs(
+            AltClipProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
 
+        if text is not None:
+            encoding = self.tokenizer(text, **output_kwargs["text_kwargs"])
         if images is not None:
-            image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)
+            image_features = self.image_processor(images, **output_kwargs["images_kwargs"])
+
+        # BC for explicit return_tensors
+        if "return_tensors" in output_kwargs["common_kwargs"]:
+            return_tensors = output_kwargs["common_kwargs"].pop("return_tensors", None)
 
         if text is not None and images is not None:
             encoding["pixel_values"] = image_features.pixel_values
diff --git a/src/transformers/models/chinese_clip/image_processing_chinese_clip.py b/src/transformers/models/chinese_clip/image_processing_chinese_clip.py
index 515c2de0cfc3..52349f84bffe 100644
--- a/src/transformers/models/chinese_clip/image_processing_chinese_clip.py
+++ b/src/transformers/models/chinese_clip/image_processing_chinese_clip.py
@@ -231,6 +231,7 @@ def preprocess(
                 - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                 - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
         """
+
         do_resize = do_resize if do_resize is not None else self.do_resize
         size = size if size is not None else self.size
         size = get_size_dict(size, default_to_square=False)
diff --git a/src/transformers/models/chinese_clip/processing_chinese_clip.py b/src/transformers/models/chinese_clip/processing_chinese_clip.py
index 1f44fc50aed5..2cfd314c6498 100644
--- a/src/transformers/models/chinese_clip/processing_chinese_clip.py
+++ b/src/transformers/models/chinese_clip/processing_chinese_clip.py
@@ -17,9 +17,15 @@
 """
 
 import warnings
+from typing import List, Union
 
-from ...processing_utils import ProcessorMixin
-from ...tokenization_utils_base import BatchEncoding
+from ...image_utils import ImageInput
+from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
+
+
+class ChineseClipProcessorKwargs(ProcessingKwargs, total=False):
+    _defaults = {}
 
 
 class ChineseCLIPProcessor(ProcessorMixin):
@@ -60,7 +66,14 @@ def __init__(self, image_processor=None, tokenizer=None, **kwargs):
         super().__init__(image_processor, tokenizer)
         self.current_processor = self.image_processor
 
-    def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
+    def __call__(
+        self,
+        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+        images: ImageInput = None,
+        audio=None,
+        videos=None,
+        **kwargs: Unpack[ChineseClipProcessorKwargs],
+    ) -> BatchEncoding:
         """
         Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
         and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
@@ -79,12 +92,10 @@ def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
 
             return_tensors (`str` or [`~utils.TensorType`], *optional*):
                 If set, will return tensors of a particular framework. Acceptable values are:
-
-                - `'tf'`: Return TensorFlow `tf.constant` objects.
-                - `'pt'`: Return PyTorch `torch.Tensor` objects.
-                - `'np'`: Return NumPy `np.ndarray` objects.
-                - `'jax'`: Return JAX `jnp.ndarray` objects.
-
+                    - `'tf'`: Return TensorFlow `tf.constant` objects.
+                    - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                    - `'np'`: Return NumPy `np.ndarray` objects.
+                    - `'jax'`: Return JAX `jnp.ndarray` objects.
         Returns:
             [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
 
@@ -97,12 +108,20 @@ def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
 
         if text is None and images is None:
             raise ValueError("You have to specify either text or images. Both cannot be none.")
+        output_kwargs = self._merge_kwargs(
+            ChineseClipProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
 
         if text is not None:
-            encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
-
+            encoding = self.tokenizer(text, **output_kwargs["text_kwargs"])
         if images is not None:
-            image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)
+            image_features = self.image_processor(images, **output_kwargs["images_kwargs"])
+
+        # BC for explicit return_tensors
+        if "return_tensors" in output_kwargs["common_kwargs"]:
+            return_tensors = output_kwargs["common_kwargs"].pop("return_tensors", None)
 
         if text is not None and images is not None:
             encoding["pixel_values"] = image_features.pixel_values
diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py
index 8b33c456924d..53e83613a07c 100644
--- a/src/transformers/processing_utils.py
+++ b/src/transformers/processing_utils.py
@@ -20,11 +20,14 @@
 import inspect
 import json
 import os
+import sys
+import typing
 import warnings
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, TypedDict, Union
 
 import numpy as np
+import typing_extensions
 
 from .dynamic_module_utils import custom_object_save
 from .image_utils import ChannelDimension, is_valid_image, is_vision_available
@@ -67,6 +70,11 @@
     "AutoImageProcessor": "ImageProcessingMixin",
 }
 
+if sys.version_info >= (3, 11):
+    Unpack = typing.Unpack
+else:
+    Unpack = typing_extensions.Unpack
+
 
 class TextKwargs(TypedDict, total=False):
     """
@@ -151,6 +159,8 @@ class methods and docstrings.
             Standard deviation to use if normalizing the image.
         do_pad (`bool`, *optional*):
             Whether to pad the image to the `(max_height, max_width)` of the images in the batch.
+        pad_size (`Dict[str, int]`, *optional*):
+            The size `{"height": int, "width" int}` to pad the images to.
         do_center_crop (`bool`, *optional*):
             Whether to center crop the image.
         data_format (`ChannelDimension` or `str`, *optional*):
@@ -170,6 +180,7 @@ class methods and docstrings.
     image_mean: Optional[Union[float, List[float]]]
     image_std: Optional[Union[float, List[float]]]
     do_pad: Optional[bool]
+    pad_size: Optional[Dict[str, int]]
     do_center_crop: Optional[bool]
     data_format: Optional[ChannelDimension]
     input_data_format: Optional[Union[str, ChannelDimension]]
@@ -814,7 +825,8 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg
                     # check if this key was passed as a flat kwarg.
                     if kwarg_value != "__empty__" and modality_key in non_modality_kwargs:
                         raise ValueError(
-                            f"Keyword argument {modality_key} was passed two times: in a dictionary for {modality} and as a **kwarg."
+                            f"Keyword argument {modality_key} was passed two times:\n"
+                            f"in a dictionary for {modality} and as a **kwarg."
                         )
                 elif modality_key in kwargs:
                     kwarg_value = kwargs.pop(modality_key, "__empty__")
diff --git a/tests/models/altclip/test_processor_altclip.py b/tests/models/altclip/test_processor_altclip.py
new file mode 100644
index 000000000000..1aca22809694
--- /dev/null
+++ b/tests/models/altclip/test_processor_altclip.py
@@ -0,0 +1,165 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import tempfile
+import unittest
+
+from transformers import XLMRobertaTokenizer, XLMRobertaTokenizerFast
+from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_vision_available
+
+from ...test_processing_common import ProcessorTesterMixin
+
+
+if is_vision_available():
+    from transformers import AltCLIPProcessor, CLIPImageProcessor
+
+
+@require_vision
+class AltClipProcessorTest(ProcessorTesterMixin, unittest.TestCase):
+    processor_class = AltCLIPProcessor
+
+    def setUp(self):
+        self.model_id = "BAAI/AltCLIP"
+        self.tmpdirname = tempfile.mkdtemp()
+        image_processor = CLIPImageProcessor()
+        tokenizer = XLMRobertaTokenizer.from_pretrained(self.model_id)
+
+        processor = self.processor_class(image_processor, tokenizer)
+
+        processor.save_pretrained(self.tmpdirname)
+
+    def get_tokenizer(self, **kwargs):
+        return XLMRobertaTokenizer.from_pretrained(self.model_id, **kwargs)
+
+    def get_rust_tokenizer(self, **kwargs):
+        return XLMRobertaTokenizerFast.from_pretrained(self.model_id, **kwargs)
+
+    def get_image_processor(self, **kwargs):
+        return CLIPImageProcessor.from_pretrained(self.model_id, **kwargs)
+
+    @require_torch
+    @require_vision
+    def test_unstructured_kwargs_batched(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = ["lower newer", "upper older longer string"]
+        image_input = self.prepare_image_inputs() * 2
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            crop_size={"height": 214, "width": 214},
+            padding="longest",
+            max_length=76,
+        )
+        self.assertEqual(inputs["pixel_values"].shape[2], 214)
+
+        self.assertEqual(len(inputs["input_ids"][0]), 7)
+
+    def test_structured_kwargs_nested(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        # Define the kwargs for each modality
+        all_kwargs = {
+            "common_kwargs": {"return_tensors": "pt"},
+            "images_kwargs": {"crop_size": {"height": 214, "width": 214}},
+            "text_kwargs": {"padding": "max_length", "max_length": 76},
+        }
+
+        inputs = processor(text=input_str, images=image_input, **all_kwargs)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        self.assertEqual(inputs["pixel_values"].shape[2], 214)
+
+        self.assertEqual(len(inputs["input_ids"][0]), 76)
+
+    def test_structured_kwargs_nested_from_dict(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        # Define the kwargs for each modality
+        all_kwargs = {
+            "common_kwargs": {"return_tensors": "pt"},
+            "images_kwargs": {"crop_size": {"height": 214, "width": 214}},
+            "text_kwargs": {"padding": "max_length", "max_length": 76},
+        }
+
+        inputs = processor(text=input_str, images=image_input, **all_kwargs)
+        self.assertEqual(inputs["pixel_values"].shape[2], 214)
+
+        self.assertEqual(len(inputs["input_ids"][0]), 76)
+
+    def test_unstructured_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            crop_size={"height": 214, "width": 214},
+            padding="max_length",
+            max_length=76,
+        )
+
+        self.assertEqual(inputs["pixel_values"].shape[2], 214)
+        self.assertEqual(len(inputs["input_ids"][0]), 76)
+
+    def test_image_processor_defaults_preserved_by_image_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor", crop_size=(234, 234))
+        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        inputs = processor(text=input_str, images=image_input)
+        self.assertEqual(len(inputs["pixel_values"][0][0]), 234)
diff --git a/tests/models/chinese_clip/test_processor_chinese_clip.py b/tests/models/chinese_clip/test_processor_chinese_clip.py
index e433c38f7891..5b191ce2df08 100644
--- a/tests/models/chinese_clip/test_processor_chinese_clip.py
+++ b/tests/models/chinese_clip/test_processor_chinese_clip.py
@@ -206,3 +206,129 @@ def test_model_input_names(self):
         inputs = processor(text=input_str, images=image_input)
 
         self.assertListEqual(list(inputs.keys()), processor.model_input_names)
+
+    def test_unstructured_kwargs_batched(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = ["lower newer", "upper older longer string"]
+        image_input = self.prepare_image_inputs() * 2
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            crop_size={"height": 214, "width": 214},
+            padding="longest",
+            max_length=76,
+        )
+        self.assertEqual(inputs["pixel_values"].shape[2], 214)
+
+        self.assertEqual(len(inputs["input_ids"][0]), 6)
+
+    def test_structured_kwargs_nested(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        # Define the kwargs for each modality
+        all_kwargs = {
+            "common_kwargs": {"return_tensors": "pt"},
+            "images_kwargs": {"crop_size": {"height": 214, "width": 214}},
+            "text_kwargs": {"padding": "max_length", "max_length": 76},
+        }
+
+        inputs = processor(text=input_str, images=image_input, **all_kwargs)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        self.assertEqual(inputs["pixel_values"].shape[2], 214)
+
+        self.assertEqual(len(inputs["input_ids"][0]), 76)
+
+    def test_structured_kwargs_nested_from_dict(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        # Define the kwargs for each modality
+        all_kwargs = {
+            "common_kwargs": {"return_tensors": "pt"},
+            "images_kwargs": {"crop_size": {"height": 214, "width": 214}},
+            "text_kwargs": {"padding": "max_length", "max_length": 76},
+        }
+
+        inputs = processor(text=input_str, images=image_input, **all_kwargs)
+        self.assertEqual(inputs["pixel_values"].shape[2], 214)
+
+        self.assertEqual(len(inputs["input_ids"][0]), 76)
+
+    def test_unstructured_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            crop_size={"height": 214, "width": 214},
+            padding="max_length",
+            max_length=76,
+        )
+
+        self.assertEqual(inputs["pixel_values"].shape[2], 214)
+        self.assertEqual(len(inputs["input_ids"][0]), 76)
+
+    def test_image_processor_defaults_preserved_by_image_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor", crop_size=(234, 234))
+        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        inputs = processor(text=input_str, images=image_input)
+        self.assertEqual(len(inputs["pixel_values"][0][0]), 234)
+
+    def test_kwargs_overrides_default_image_processor_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor", crop_size=(234, 234))
+        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        inputs = processor(text=input_str, images=image_input, crop_size=[224, 224])
+        self.assertEqual(len(inputs["pixel_values"][0][0]), 224)
diff --git a/tests/models/instructblip/test_processor_instructblip.py b/tests/models/instructblip/test_processor_instructblip.py
index cc929e3575a6..e03e555fed08 100644
--- a/tests/models/instructblip/test_processor_instructblip.py
+++ b/tests/models/instructblip/test_processor_instructblip.py
@@ -409,3 +409,31 @@ def test_structured_kwargs_nested_from_dict(self):
         self.assertEqual(inputs["pixel_values"].shape[2], 214)
 
         self.assertEqual(len(inputs["input_ids"][0]), 76)
+
+    def test_overlapping_text_kwargs_handling(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        processor_kwargs = {}
+        processor_kwargs["image_processor"] = self.get_component("image_processor")
+        processor_kwargs["tokenizer"] = tokenizer = self.get_component("tokenizer")
+        if not tokenizer.pad_token:
+            tokenizer.pad_token = "[TEST_PAD]"
+        if "video_processor" in self.processor_class.attributes:
+            processor_kwargs["video_processor"] = self.get_component("video_processor")
+
+        qformer_tokenizer = self.get_component("qformer_tokenizer")
+
+        processor = self.processor_class(**processor_kwargs, qformer_tokenizer=qformer_tokenizer)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        with self.assertRaises(ValueError):
+            _ = processor(
+                text=input_str,
+                images=image_input,
+                return_tensors="pt",
+                padding="max_length",
+                text_kwargs={"padding": "do_not_pad"},
+            )
diff --git a/tests/models/instructblipvideo/test_processor_instructblipvideo.py b/tests/models/instructblipvideo/test_processor_instructblipvideo.py
index 945fe004d12e..8b29c7717592 100644
--- a/tests/models/instructblipvideo/test_processor_instructblipvideo.py
+++ b/tests/models/instructblipvideo/test_processor_instructblipvideo.py
@@ -423,3 +423,31 @@ def test_structured_kwargs_nested_from_dict(self):
         self.assertEqual(inputs["pixel_values"].shape[2], 214)
 
         self.assertEqual(len(inputs["input_ids"][0]), 76)
+
+    def test_overlapping_text_kwargs_handling(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        processor_kwargs = {}
+        processor_kwargs["image_processor"] = self.get_component("image_processor")
+        processor_kwargs["tokenizer"] = tokenizer = self.get_component("tokenizer")
+        if not tokenizer.pad_token:
+            tokenizer.pad_token = "[TEST_PAD]"
+        if "video_processor" in self.processor_class.attributes:
+            processor_kwargs["video_processor"] = self.get_component("video_processor")
+
+        qformer_tokenizer = self.get_component("qformer_tokenizer")
+
+        processor = self.processor_class(**processor_kwargs, qformer_tokenizer=qformer_tokenizer)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        with self.assertRaises(ValueError):
+            _ = processor(
+                text=input_str,
+                images=image_input,
+                return_tensors="pt",
+                padding="max_length",
+                text_kwargs={"padding": "do_not_pad"},
+            )
diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py
index 306f3100fb8d..fe17de4eeb5c 100644
--- a/tests/test_processing_common.py
+++ b/tests/test_processing_common.py
@@ -146,7 +146,6 @@ def test_tokenizer_defaults_preserved_by_kwargs(self):
         self.skip_processor_without_typed_kwargs(processor)
         input_str = "lower newer"
         image_input = self.prepare_image_inputs()
-
         inputs = processor(text=input_str, images=image_input, return_tensors="pt")
         self.assertEqual(len(inputs["input_ids"][0]), 117)
 
@@ -175,7 +174,6 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self):
         self.skip_processor_without_typed_kwargs(processor)
         input_str = "lower newer"
         image_input = self.prepare_image_inputs()
-
         inputs = processor(
             text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length"
         )
@@ -238,7 +236,6 @@ def test_unstructured_kwargs_batched(self):
             padding="longest",
             max_length=76,
         )
-
         self.assertEqual(inputs["pixel_values"].shape[2], 214)
 
         self.assertEqual(len(inputs["input_ids"][0]), 6)
@@ -311,3 +308,30 @@ def test_structured_kwargs_nested_from_dict(self):
         self.assertEqual(inputs["pixel_values"].shape[2], 214)
 
         self.assertEqual(len(inputs["input_ids"][0]), 76)
+
+    # TODO: the same test, but for audio + text processors that have strong overlap in kwargs
+    # TODO (molbap) use the same structure of attribute kwargs for other tests to avoid duplication
+    def test_overlapping_text_kwargs_handling(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        processor_kwargs = {}
+        processor_kwargs["image_processor"] = self.get_component("image_processor")
+        processor_kwargs["tokenizer"] = tokenizer = self.get_component("tokenizer")
+        if not tokenizer.pad_token:
+            tokenizer.pad_token = "[TEST_PAD]"
+        if "video_processor" in self.processor_class.attributes:
+            processor_kwargs["video_processor"] = self.get_component("video_processor")
+        processor = self.processor_class(**processor_kwargs)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        with self.assertRaises(ValueError):
+            _ = processor(
+                text=input_str,
+                images=image_input,
+                return_tensors="pt",
+                padding="max_length",
+                text_kwargs={"padding": "do_not_pad"},
+            )

From d9d59e7baceb9acc1bc0d110de4c6aabbb7a4717 Mon Sep 17 00:00:00 2001
From: Joao Gante <joaofranciscocardosogante@gmail.com>
Date: Thu, 19 Sep 2024 16:23:17 +0100
Subject: [PATCH 52/67] Generate: check that `attention_mask` is 2D (#33575)

check attention mask in generate
---
 src/transformers/generation/utils.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py
index a4cb000cf462..2ba330101e0c 100644
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@@ -1864,6 +1864,10 @@ def generate(
             model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation(
                 inputs_tensor, generation_config._pad_token_tensor, generation_config._eos_token_tensor
             )
+        elif kwargs_has_attention_mask:
+            # TODO (joao): generalize this check with other types of inputs
+            if model_input_name == "input_ids" and len(model_kwargs["attention_mask"].shape) > 2:
+                raise ValueError("`attention_mask` passed to `generate` must be 2D.")
 
         if self.config.is_encoder_decoder and "encoder_outputs" not in model_kwargs:
             # if model is encoder decoder encoder_outputs are created and added to `model_kwargs`

From 162056a3f433132fa6ce32c87663742803036b59 Mon Sep 17 00:00:00 2001
From: Vladislav Bronzov <58587565+VladOS95-cyber@users.noreply.github.com>
Date: Thu, 19 Sep 2024 18:35:44 +0200
Subject: [PATCH 53/67] =?UTF-8?q?change=20sequence=5Fbias=20type=20of=20Se?=
 =?UTF-8?q?quenceBiasLogitsProcessor=20to=20list,=20add=E2=80=A6=20(#33375?=
 =?UTF-8?q?)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* change sequence_bias type of SequenceBiasLogitsProcessor tp list, add config tests for all processors

* fix format

* small fix for all_token_bias_pairs_are_valid internal func

* small typo fix in description

* improve test impl, some SequenceBiasLogitsProcessor refactoring
---
 src/transformers/generation/logits_process.py |  53 ++-
 tests/generation/test_configuration_utils.py  | 449 +++++++++++++++++-
 2 files changed, 486 insertions(+), 16 deletions(-)

diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py
index c586a97459ac..d88c7a17d892 100644
--- a/src/transformers/generation/logits_process.py
+++ b/src/transformers/generation/logits_process.py
@@ -15,7 +15,7 @@
 
 import inspect
 import math
-from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
+from typing import Callable, Iterable, List, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -1064,8 +1064,9 @@ class SequenceBiasLogitsProcessor(LogitsProcessor):
     </Tip>
 
     Args:
-        sequence_bias (`Dict[Tuple[int], float]`):
-            Dictionary that maps a sequence of tokens to its bias term. Positive biases increase the odds of the
+        sequence_bias (`List[List[Union[List[int], float]]]`):
+            List of lists that maps a sequence of tokens to its bias term (e.g. `[[[10, 45], -2.0],
+            [[64], -7.5]]`). Positive biases increase the odds of the
             sequence being selected, while negative biases do the opposite. If a sequence has a length of 1, its bias
             will always be applied. Otherwise, the bias will only be applied if the sequence in question is about to be
             completed (in the token selection step after this processor is applied).
@@ -1087,12 +1088,12 @@ class SequenceBiasLogitsProcessor(LogitsProcessor):
     >>> tokenizer_with_prefix_space = AutoTokenizer.from_pretrained("openai-community/gpt2", add_prefix_space=True)
 
 
-    >>> def get_tokens_as_tuple(word):
-    ...     return tuple(tokenizer_with_prefix_space([word], add_special_tokens=False).input_ids[0])
+    >>> def get_tokens(word):
+    ...     return tokenizer_with_prefix_space([word], add_special_tokens=False).input_ids[0]
 
 
     >>> # If we add a negative bias without beam search, it may become "stuck" in a prefix without good continuations
-    >>> sequence_bias = {get_tokens_as_tuple("Trump"): -10.0}
+    >>> sequence_bias = [get_tokens("Trump"), -10.0]
     >>> biased_ids = model.generate(inputs["input_ids"], max_new_tokens=4, sequence_bias=sequence_bias)
     >>> print(tokenizer.batch_decode(biased_ids, skip_special_tokens=True)[0])
     The full name of Donald is Donald J. Donald,
@@ -1102,16 +1103,17 @@ class SequenceBiasLogitsProcessor(LogitsProcessor):
     The full name of Donald is Donald Rumsfeld,
 
     >>> # We can also add a positive bias to nudge the model towards specific tokens or continuations
-    >>> sequence_bias = {get_tokens_as_tuple("Donald Duck"): 10.0}
+    >>> sequence_bias = [get_tokens("Donald Duck"), 10.0]
     >>> biased_ids = model.generate(inputs["input_ids"], max_new_tokens=4, num_beams=4, sequence_bias=sequence_bias)
     >>> print(tokenizer.batch_decode(biased_ids, skip_special_tokens=True)[0])
     The full name of Donald is Donald Duck.
     ```
     """
 
-    def __init__(self, sequence_bias: Dict[Tuple[int], float]):
+    def __init__(self, sequence_bias: List[List[Union[List[int], float]]]):
         self.sequence_bias = sequence_bias
         self._validate_arguments()
+        self._convert_list_arguments_into_dict()
 
         # Bias variables that will be populated on the first call (for retrocompatibility purposes, the vocabulary size
         # is infered in the first usage, which inhibits initializing here)
@@ -1178,11 +1180,15 @@ def _prepare_bias_variables(self, scores: torch.FloatTensor):
 
     def _validate_arguments(self):
         sequence_bias = self.sequence_bias
-        if not isinstance(sequence_bias, dict) or len(sequence_bias) == 0:
-            raise ValueError(f"`sequence_bias` has to be a non-empty dictionary, but is {sequence_bias}.")
-        if any(not isinstance(sequence_ids, tuple) for sequence_ids in sequence_bias.keys()):
+        if not isinstance(sequence_bias, dict) and not isinstance(sequence_bias, list) or len(sequence_bias) == 0:
+            raise ValueError(
+                f"`sequence_bias` has to be a non-empty dictionary, or non-empty list of lists but is {sequence_bias}."
+            )
+        if isinstance(sequence_bias, dict) and any(
+            not isinstance(sequence_ids, tuple) for sequence_ids in sequence_bias.keys()
+        ):
             raise ValueError(f"`sequence_bias` has to be a dict with tuples as keys, but is {sequence_bias}.")
-        if any(
+        if isinstance(sequence_bias, dict) and any(
             any((not isinstance(token_id, (int, np.integer)) or token_id < 0) for token_id in sequence_ids)
             or len(sequence_ids) == 0
             for sequence_ids in sequence_bias.keys()
@@ -1191,9 +1197,30 @@ def _validate_arguments(self):
                 f"Each key in `sequence_bias` has to be a non-empty tuple of positive integers, but is "
                 f"{sequence_bias}."
             )
-        if any(not isinstance(bias, float) for bias in sequence_bias.values()):
+
+        def all_token_bias_pairs_are_valid(sequence):
+            return (
+                isinstance(sequence[0], list)
+                and all(isinstance(token_id, (int, np.integer)) and token_id > 0 for token_id in sequence[0])
+                and isinstance(sequence[1], float)
+            )
+
+        if isinstance(sequence_bias, list) and any(
+            (not all_token_bias_pairs_are_valid(sequence)) or len(sequence) == 0 for sequence in sequence_bias
+        ):
+            raise ValueError(
+                f"Each element in `sequence_bias` has to be a non-empty list of lists of positive integers and float, but is "
+                f"{sequence_bias}."
+            )
+        if isinstance(sequence_bias, dict) and any(not isinstance(bias, float) for bias in sequence_bias.values()):
             raise ValueError(f"`sequence_bias` has to be a dict with floats as values, but is {sequence_bias}.")
 
+    def _convert_list_arguments_into_dict(self):
+        """BC: we used to accept `dict{tuple of tokens: float}` directly, now we expect a list"""
+        if isinstance(self.sequence_bias, list):
+            temp_sequence = self.sequence_bias
+            self.sequence_bias = {tuple(sublist[0]): sublist[1] for sublist in temp_sequence}
+
 
 class NoBadWordsLogitsProcessor(SequenceBiasLogitsProcessor):
     """
diff --git a/tests/generation/test_configuration_utils.py b/tests/generation/test_configuration_utils.py
index cd5f3d50162c..1e11a9679b25 100644
--- a/tests/generation/test_configuration_utils.py
+++ b/tests/generation/test_configuration_utils.py
@@ -23,9 +23,41 @@
 from huggingface_hub import HfFolder, delete_repo
 from parameterized import parameterized
 
-from transformers import AutoConfig, GenerationConfig
-from transformers.generation import GenerationMode
-from transformers.testing_utils import TOKEN, USER, is_staging_test
+from transformers import AutoConfig, GenerationConfig, WatermarkingConfig, is_torch_available
+
+
+if is_torch_available():
+    import torch
+
+from transformers.generation import (
+    ClassifierFreeGuidanceLogitsProcessor,
+    EncoderNoRepeatNGramLogitsProcessor,
+    EncoderRepetitionPenaltyLogitsProcessor,
+    EpsilonLogitsWarper,
+    EtaLogitsWarper,
+    ExponentialDecayLengthPenalty,
+    ForcedBOSTokenLogitsProcessor,
+    ForcedEOSTokenLogitsProcessor,
+    GenerationMode,
+    HammingDiversityLogitsProcessor,
+    MinLengthLogitsProcessor,
+    MinNewTokensLengthLogitsProcessor,
+    MinPLogitsWarper,
+    NoBadWordsLogitsProcessor,
+    NoRepeatNGramLogitsProcessor,
+    PrefixConstrainedLogitsProcessor,
+    RepetitionPenaltyLogitsProcessor,
+    SequenceBiasLogitsProcessor,
+    SuppressTokensAtBeginLogitsProcessor,
+    SuppressTokensLogitsProcessor,
+    TemperatureLogitsWarper,
+    TopKLogitsWarper,
+    TopPLogitsWarper,
+    TypicalLogitsWarper,
+    UnbatchedClassifierFreeGuidanceLogitsProcessor,
+    WatermarkLogitsProcessor,
+)
+from transformers.testing_utils import TOKEN, USER, is_staging_test, torch_device
 
 
 class GenerationConfigTest(unittest.TestCase):
@@ -225,6 +257,417 @@ def test_generation_mode(self):
         self.assertEqual(config.get_generation_mode(assistant_model="foo"), GenerationMode.ASSISTED_GENERATION)
 
 
+class GenerationConfigSerializationTest(unittest.TestCase):
+    def test_serialize_generation_sequence_bias(self):
+        """Tests that GenerationConfig is serialized and SequenceBiasLogitsProcessor is initialized with sequence_bias parameter"""
+        generation_config = GenerationConfig()
+        sequence_bias = [[[45, 67], -0.6], [[89], 1.2]]
+        generation_config.sequence_bias = sequence_bias
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertSequenceEqual(new_config.sequence_bias, sequence_bias)
+
+        expected_sequence_bias = {(45, 67): -0.6, (89,): 1.2}
+        bias_logits_processor = SequenceBiasLogitsProcessor(new_config.sequence_bias)
+        self.assertDictEqual(bias_logits_processor.sequence_bias, expected_sequence_bias)
+
+    def test_serialize_generation_min_length_eos_token(self):
+        """Tests that GenerationConfig is serialized and MinLengthLogitsProcessor is initialized with min_length and eos_token_id"""
+        eos_token_id = 0
+        min_length = 10
+
+        generation_config = GenerationConfig(min_length=min_length, eos_token_id=eos_token_id)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.min_length, min_length)
+        self.assertEqual(new_config.eos_token_id, eos_token_id)
+
+        min_dist_processor = MinLengthLogitsProcessor(
+            min_length=new_config.min_length, eos_token_id=new_config.eos_token_id
+        )
+        self.assertEqual(min_dist_processor.min_length, min_length)
+        self.assertEqual(min_dist_processor.eos_token_id, eos_token_id)
+
+    def test_serialize_generation_min_new_tokens(self):
+        """Tests that GenerationConfig is serialized and MinNewTokensLengthLogitsProcessor is initialized with min_new_tokens"""
+        eos_token_id = 0
+        min_new_tokens = 5
+        prompt_length_to_skip = 2
+
+        generation_config = GenerationConfig(min_new_tokens=min_new_tokens)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.min_new_tokens, min_new_tokens)
+
+        min_new_tokens_processor = MinNewTokensLengthLogitsProcessor(
+            prompt_length_to_skip=prompt_length_to_skip,
+            min_new_tokens=new_config.min_new_tokens,
+            eos_token_id=eos_token_id,
+        )
+        self.assertEqual(min_new_tokens_processor.min_new_tokens, min_new_tokens)
+
+    def test_serialize_generation_temperature(self):
+        """Tests that GenerationConfig is serialized and TemperatureLogitsWarper is initialized with temperature"""
+        temperature = 2.0
+
+        generation_config = GenerationConfig(temperature=temperature, do_sample=True)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.temperature, temperature)
+
+        temperature_logits_warper = TemperatureLogitsWarper(temperature=new_config.temperature)
+        self.assertEqual(temperature_logits_warper.temperature, temperature)
+
+    def test_serialize_generation_repetition_penalty(self):
+        """Tests that GenerationConfig is serialized and RepetitionPenaltyLogitsProcessor is initialized with repetition_penalty"""
+        penalty = 2.0
+
+        generation_config = GenerationConfig(repetition_penalty=penalty)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.repetition_penalty, penalty)
+
+        rep_penalty_proc = RepetitionPenaltyLogitsProcessor(penalty=new_config.repetition_penalty)
+        self.assertEqual(rep_penalty_proc.penalty, penalty)
+
+    def test_serialize_generation_encoder_repetition_penalty(self):
+        """Tests that GenerationConfig is serialized and EncoderRepetitionPenaltyLogitsProcessor is initialized with penalty and input_ids"""
+        penalty = 2.0
+        input_ids = torch.tensor([[0, 1], [5, 0]], device=torch_device, dtype=torch.long)
+
+        generation_config = GenerationConfig(encoder_repetition_penalty=penalty)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.encoder_repetition_penalty, penalty)
+
+        rep_penalty_proc = EncoderRepetitionPenaltyLogitsProcessor(
+            penalty=new_config.encoder_repetition_penalty, encoder_input_ids=input_ids
+        )
+        self.assertEqual(rep_penalty_proc.penalty, 1 / penalty)
+        torch.testing.assert_close(rep_penalty_proc.encoder_input_ids, input_ids)
+
+    def test_serialize_generation_top_p(self):
+        """Tests that GenerationConfig is serialized and TopPLogitsWarper is initialized with top_p"""
+        top_p = 0.8
+
+        generation_config = GenerationConfig(top_p=top_p, do_sample=True)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.top_p, top_p)
+
+        rep_penalty_proc = TopPLogitsWarper(top_p=new_config.top_p)
+        self.assertEqual(rep_penalty_proc.top_p, top_p)
+
+    def test_serialize_generation_top_k(self):
+        """Tests that GenerationConfig is serialized and TopKLogitsWarper is initialized with top_k"""
+        top_k = 2
+
+        generation_config = GenerationConfig(top_k=top_k, do_sample=True)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.top_k, top_k)
+
+        top_k_logits_wrap = TopKLogitsWarper(top_k=new_config.top_k)
+        self.assertEqual(top_k_logits_wrap.top_k, top_k)
+
+    def test_serialize_generation_min_p(self):
+        """Tests that GenerationConfig is serialized and MinPLogitsWarper is initialized with min_p"""
+        min_p = 0.8
+
+        generation_config = GenerationConfig(min_p=min_p, do_sample=True)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.min_p, min_p)
+
+        min_k_logits_wrap = MinPLogitsWarper(min_p=new_config.min_p)
+        self.assertEqual(min_k_logits_wrap.min_p, min_p)
+
+    def test_serialize_generation_typical_p(self):
+        """Tests that GenerationConfig is serialized and TypicalLogitsWarper is initialized with mass"""
+        mass = 0.8
+
+        generation_config = GenerationConfig(typical_p=mass, do_sample=True)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.typical_p, mass)
+
+        typical_p_logits_wrap = TypicalLogitsWarper(mass=new_config.typical_p)
+        self.assertEqual(typical_p_logits_wrap.mass, mass)
+
+    def test_serialize_generation_epsilon_cutoff(self):
+        """Tests that GenerationConfig is serialized and EpsilonLogitsWarper is initialized with epsilon"""
+        epsilon = 0.8
+
+        generation_config = GenerationConfig(epsilon_cutoff=epsilon, do_sample=True)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.epsilon_cutoff, epsilon)
+
+        epsilon_logits_wrap = EpsilonLogitsWarper(epsilon=new_config.epsilon_cutoff)
+        self.assertEqual(epsilon_logits_wrap.epsilon, epsilon)
+
+    def test_serialize_generation_eta_cutoff(self):
+        """Tests that GenerationConfig is serialized and EtaLogitsWarper is initialized with epsilon"""
+        epsilon = 0.8
+
+        generation_config = GenerationConfig(eta_cutoff=epsilon, do_sample=True)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.eta_cutoff, epsilon)
+
+        eta_logits_wrap = EtaLogitsWarper(epsilon=new_config.eta_cutoff)
+        self.assertEqual(eta_logits_wrap.epsilon, epsilon)
+
+    def test_serialize_generation_ngram_size(self):
+        """Tests that GenerationConfig is serialized and NoRepeatNGramLogitsProcessor is initialized with ngram_size"""
+        ngram_size = 2
+
+        generation_config = GenerationConfig(no_repeat_ngram_size=ngram_size, do_sample=True)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.no_repeat_ngram_size, ngram_size)
+
+        no_repeat_ngram_proc = NoRepeatNGramLogitsProcessor(ngram_size=new_config.no_repeat_ngram_size)
+        self.assertEqual(no_repeat_ngram_proc.ngram_size, ngram_size)
+
+    def test_serialize_generation_encoder_ngram_size(self):
+        """Tests that GenerationConfig is serialized and EncoderNoRepeatNGramLogitsProcessor is initialized with ngram_size"""
+        ngram_size = 2
+        input_ids = torch.tensor([[0, 1], [5, 0]], device=torch_device, dtype=torch.long)
+
+        generation_config = GenerationConfig(encoder_no_repeat_ngram_size=ngram_size, do_sample=True)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.encoder_no_repeat_ngram_size, ngram_size)
+
+        encoder_no_repeat_ngram_proc = EncoderNoRepeatNGramLogitsProcessor(
+            encoder_ngram_size=new_config.encoder_no_repeat_ngram_size, encoder_input_ids=input_ids
+        )
+        self.assertEqual(encoder_no_repeat_ngram_proc.ngram_size, ngram_size)
+
+    def test_serialize_generation_bad_words_ids(self):
+        """Tests that GenerationConfig is serialized and NoBadWordsLogitsProcessor is initialized with bad_words_ids"""
+        bad_word_tokens = [[1], [4], [1, 0], [0, 1, 2], [1, 3, 1, 3]]
+
+        generation_config = GenerationConfig(bad_words_ids=bad_word_tokens)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertSequenceEqual(new_config.bad_words_ids, bad_word_tokens)
+
+        no_bad_words_dist_proc = NoBadWordsLogitsProcessor(bad_words_ids=new_config.bad_words_ids)
+        self.assertSequenceEqual(no_bad_words_dist_proc.bad_word_ids, bad_word_tokens)
+
+    def test_serialize_generation_num_beams(self):
+        """Tests that GenerationConfig is serialized and PrefixConstrainedLogitsProcessor is initialized with num_beams"""
+        num_beams = 1
+
+        def prefix_allowed_tokens_fn(batch_id, inputs_ids):
+            return [[0, 1], [2, 3]][batch_id]
+
+        generation_config = GenerationConfig(num_beams=num_beams)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.num_beams, num_beams)
+
+        prefix_constrained_logits_proc = PrefixConstrainedLogitsProcessor(
+            prefix_allowed_tokens_fn, num_beams=new_config.num_beams
+        )
+        self.assertEqual(prefix_constrained_logits_proc._num_beams, num_beams)
+
+    def test_serialize_generation_diversity_penalty_and_num_bean_groups(self):
+        """Tests that GenerationConfig is serialized and HammingDiversityLogitsProcessor is initialized with diversity_penalty_and_num_bean_groups"""
+        num_beams = 2
+        num_beam_groups = 2
+        diversity_penalty = 1.0
+
+        generation_config = GenerationConfig(
+            num_beams=num_beams, diversity_penalty=diversity_penalty, num_beam_groups=num_beam_groups
+        )
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.num_beams, num_beams)
+        self.assertEqual(new_config.diversity_penalty, diversity_penalty)
+        self.assertEqual(new_config.num_beam_groups, num_beam_groups)
+
+        diversity_logits_processor = HammingDiversityLogitsProcessor(
+            diversity_penalty=new_config.diversity_penalty,
+            num_beams=new_config.num_beams,
+            num_beam_groups=new_config.num_beam_groups,
+        )
+        self.assertEqual(diversity_logits_processor._num_beams, num_beams)
+        self.assertEqual(diversity_logits_processor._diversity_penalty, diversity_penalty)
+        self.assertEqual(diversity_logits_processor._num_sub_beams, num_beams // num_beam_groups)
+
+    def test_serialize_generation_bos_token_id(self):
+        """Tests that GenerationConfig is serialized and ForcedBOSTokenLogitsProcessor is initialized with bos_token_id"""
+        bos_token_id = 0
+
+        generation_config = GenerationConfig(bos_token_id=bos_token_id)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.bos_token_id, bos_token_id)
+
+        logits_processor = ForcedBOSTokenLogitsProcessor(bos_token_id=new_config.bos_token_id)
+        self.assertEqual(logits_processor.bos_token_id, bos_token_id)
+
+    def test_serialize_generation_eos_token_id(self):
+        """Tests that GenerationConfig is serialized and ForcedEOSTokenLogitsProcessor is initialized with eos_token_id"""
+        eos_token_id = 0
+        max_length = 5
+
+        generation_config = GenerationConfig(eos_token_id=eos_token_id)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.eos_token_id, eos_token_id)
+
+        logits_processor = ForcedEOSTokenLogitsProcessor(
+            max_length=max_length, eos_token_id=new_config.eos_token_id, device=torch_device
+        )
+        self.assertEqual(logits_processor.eos_token_id, eos_token_id)
+
+    def test_serialize_generation_exponential_decay_length_penalty(self):
+        """Tests that GenerationConfig is serialized and ExponentialDecayLengthPenalty is initialized with regulation_start and regulation_factor"""
+        eos_token_id = 0
+        penalty_start = 5
+        penalty_factor = 1.1
+        input_ids_seq_length = 10
+        exponential_decay_length_penalty = (penalty_start, penalty_factor)
+
+        generation_config = GenerationConfig(exponential_decay_length_penalty=exponential_decay_length_penalty)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.exponential_decay_length_penalty, [penalty_start, penalty_factor])
+
+        exponential_decay_processor = ExponentialDecayLengthPenalty(
+            exponential_decay_length_penalty=new_config.exponential_decay_length_penalty,
+            eos_token_id=eos_token_id,
+            input_ids_seq_length=input_ids_seq_length,
+        )
+        self.assertEqual(
+            exponential_decay_processor.regulation_start, exponential_decay_length_penalty[0] + input_ids_seq_length
+        )
+        self.assertEqual(exponential_decay_processor.regulation_factor, exponential_decay_length_penalty[1])
+
+    def test_serialize_generation_begin_suppress_tokens(self):
+        """Tests that GenerationConfig is serialized and SuppressTokensAtBeginLogitsProcessor is initialized with begin_suppress_token and begin_index"""
+
+        begin_suppress_tokens = [220, 50256]
+        begin_index = 0
+        generation_config = GenerationConfig(begin_suppress_tokens=begin_suppress_tokens)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertSequenceEqual(new_config.begin_suppress_tokens, begin_suppress_tokens)
+
+        suppress_processor = SuppressTokensAtBeginLogitsProcessor(
+            begin_suppress_tokens=new_config.begin_suppress_tokens, begin_index=begin_index
+        )
+        self.assertSequenceEqual(suppress_processor.begin_suppress_tokens, begin_suppress_tokens)
+        self.assertEqual(suppress_processor.begin_index, begin_index)
+
+    def test_serialize_generation_suppress_tokens(self):
+        """Tests that GenerationConfig is serialized and SuppressTokensLogitsProcessor is initialized with suppress_token"""
+        suppress_tokens = [220, 50256]
+
+        generation_config = GenerationConfig(suppress_tokens=suppress_tokens)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertSequenceEqual(new_config.suppress_tokens, suppress_tokens)
+
+        suppress_processor = SuppressTokensLogitsProcessor(suppress_tokens=new_config.suppress_tokens)
+        self.assertSequenceEqual(suppress_processor.suppress_tokens, suppress_tokens)
+
+    def test_serialize_generation_guidance_scale(self):
+        """Tests that GenerationConfig is serialized and ClassifierFreeGuidanceLogitsProcessor is initialized with guidance_scale"""
+        guidance_scale = 2.0
+        generation_config = GenerationConfig(guidance_scale=guidance_scale)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.guidance_scale, guidance_scale)
+
+        classifier_processor = ClassifierFreeGuidanceLogitsProcessor(guidance_scale=new_config.guidance_scale)
+        self.assertEqual(classifier_processor.guidance_scale, guidance_scale)
+
+    def test_serialize_generation_guidance_scale_unbatched(self):
+        """Tests that GenerationConfig is serialized and UnbatchedClassifierFreeGuidanceLogitsProcessor is initialized with guidance_scale"""
+        guidance_scale = 2.0
+
+        input_ids = torch.LongTensor([[0]])
+
+        generation_config = GenerationConfig(guidance_scale=guidance_scale)
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.guidance_scale, guidance_scale)
+
+        cfg = UnbatchedClassifierFreeGuidanceLogitsProcessor(new_config.guidance_scale, {}, input_ids)
+        self.assertEqual(cfg.guidance_scale, guidance_scale)
+
+    def test_serialize_generation_watermarking_config(self):
+        """Tests that GenerationConfig is serialized and WatermarkLogitsProcessor is initialized with WatermarkingConfig parameters"""
+
+        vocab_size = 20
+        bias = 2.0
+        greenlist_ratio = 0.5
+        hashing_key = 10
+        seeding_scheme = "lefthash"
+        context_width = 10
+        watermarking_config = WatermarkingConfig(
+            bias=bias,
+            greenlist_ratio=greenlist_ratio,
+            hashing_key=hashing_key,
+            seeding_scheme=seeding_scheme,
+            context_width=context_width,
+        )
+        generation_config = GenerationConfig(watermarking_config=watermarking_config)
+
+        with tempfile.TemporaryDirectory("test-generation-config") as tmp_dir:
+            generation_config.save_pretrained(tmp_dir)
+            new_config = GenerationConfig.from_pretrained(tmp_dir)
+        self.assertEqual(new_config.watermarking_config.bias, bias)
+        self.assertEqual(new_config.watermarking_config.greenlist_ratio, greenlist_ratio)
+        self.assertEqual(new_config.watermarking_config.hashing_key, hashing_key)
+        self.assertEqual(new_config.watermarking_config.seeding_scheme, seeding_scheme)
+        self.assertEqual(new_config.watermarking_config.context_width, context_width)
+
+        watermark = WatermarkLogitsProcessor(
+            vocab_size=vocab_size,
+            device=torch_device,
+            greenlist_ratio=new_config.watermarking_config.greenlist_ratio,
+            bias=new_config.watermarking_config.bias,
+            hashing_key=new_config.watermarking_config.hashing_key,
+            seeding_scheme=new_config.watermarking_config.seeding_scheme,
+            context_width=new_config.watermarking_config.context_width,
+        )
+        self.assertEqual(watermark.bias, bias)
+        self.assertEqual(watermark.greenlist_size, int(vocab_size * greenlist_ratio))
+        self.assertEqual(watermark.hash_key, hashing_key)
+        self.assertEqual(watermark.seeding_scheme, seeding_scheme)
+        self.assertEqual(watermark.context_width, context_width)
+
+
 @is_staging_test
 class ConfigPushToHubTester(unittest.TestCase):
     @classmethod

From b50ff5993a5d8b2a3d8c7558e81684f8803b044a Mon Sep 17 00:00:00 2001
From: Anton Vlasjuk <73884904+vasqu@users.noreply.github.com>
Date: Thu, 19 Sep 2024 18:41:17 +0200
Subject: [PATCH 54/67] [`Mamba2`] Move dt calculations to kernel (#33520)

* use kernel for dt calculations

* add small test

* [run-slow] mamba2
---
 .../models/mamba2/modeling_mamba2.py          |  3 ++-
 tests/models/mamba2/test_modeling_mamba2.py   | 26 ++++++++++++++++++-
 2 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/mamba2/modeling_mamba2.py b/src/transformers/models/mamba2/modeling_mamba2.py
index 69390ea9ad2b..7b414ff9570d 100644
--- a/src/transformers/models/mamba2/modeling_mamba2.py
+++ b/src/transformers/models/mamba2/modeling_mamba2.py
@@ -358,7 +358,6 @@ def cuda_kernels_forward(
                     dim=-1,
                 )
 
-                time_step = nn.functional.softplus(time_step + self.dt_bias)
                 # 1D Convolution
                 if causal_conv1d_fn is None or self.activation not in ["silu", "swish"]:
                     hidden_states_B_C = self.act(
@@ -391,6 +390,8 @@ def cuda_kernels_forward(
                     z=None,
                     seq_idx=None,
                     return_final_states=True,
+                    dt_bias=self.dt_bias,
+                    dt_softplus=True,
                     **dt_limit_kwargs,
                 )
                 if ssm_state is not None and cache_params is not None:
diff --git a/tests/models/mamba2/test_modeling_mamba2.py b/tests/models/mamba2/test_modeling_mamba2.py
index a1e2138d4d6d..55c18abe6b96 100644
--- a/tests/models/mamba2/test_modeling_mamba2.py
+++ b/tests/models/mamba2/test_modeling_mamba2.py
@@ -35,7 +35,7 @@
         Mamba2ForCausalLM,
         Mamba2Model,
     )
-    from transformers.models.mamba2.modeling_mamba2 import Mamba2Cache
+    from transformers.models.mamba2.modeling_mamba2 import Mamba2Cache, Mamba2Mixer
     from transformers.pytorch_utils import is_torch_greater_or_equal_than_2_0
 else:
     is_torch_greater_or_equal_than_2_0 = False
@@ -378,3 +378,27 @@ def test_batched_equivalence_without_cache(self):
             individual_gen = model.generate(**inputs, max_new_tokens=30, use_cache=True)
             individual_output = tokenizer.batch_decode(individual_gen, skip_special_tokens=True)[0]
             self.assertEqual(individual_output[:100], batched_output[index_gen][:100])
+
+    @slow
+    @require_torch_gpu
+    def test_mamba2_mixer_train_vs_eval_equivalence(self):
+        # Based on https://github.com/sustcsonglin/flash-linear-attention/issues/63
+        # Credit to zhixuan-lin
+
+        B, T, D = 4, 512, 768
+        dtype = torch.bfloat16
+        config = Mamba2Config(num_heads=24, head_dim=64, hidden_size=768, expand=2, n_groups=1)
+
+        torch.manual_seed(42)
+        with torch.amp.autocast(device_type="cuda", dtype=dtype):
+            with torch.no_grad():
+                mixer = Mamba2Mixer(config, layer_idx=0).to("cuda")
+                hidden_states = torch.rand(size=(B, T, D), dtype=dtype, device="cuda")
+
+                mixer.train()
+                out_train = mixer(hidden_states)
+
+                mixer.eval()
+                out_eval = mixer(hidden_states)
+
+                self.assertTrue(torch.allclose(out_train, out_eval, atol=1e-3))

From 52920b5dd5ad3b5e94209ef392ab5ceccbb1c869 Mon Sep 17 00:00:00 2001
From: Joao Gante <joaofranciscocardosogante@gmail.com>
Date: Thu, 19 Sep 2024 17:42:47 +0100
Subject: [PATCH 55/67] Cache: don't throw warnings on `gemma2` when
 instantiating a new cache (#33595)

---
 src/transformers/cache_utils.py               | 10 ++++-
 .../models/gemma2/modeling_gemma2.py          | 41 +++++++------------
 src/transformers/models/mimi/modeling_mimi.py | 12 +++++-
 tests/models/gemma2/test_modeling_gemma2.py   |  5 +++
 4 files changed, 38 insertions(+), 30 deletions(-)

diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py
index 0671157e4470..d42b15c14abf 100644
--- a/src/transformers/cache_utils.py
+++ b/src/transformers/cache_utils.py
@@ -1660,7 +1660,15 @@ def get_max_length(self) -> Optional[int]:
         return self.max_cache_len
 
     def get_seq_length(self, layer_idx: Optional[int] = 0):
-        return None
+        # Occupied cache == any slot in the 3rd dim (sequence length) holds a non-zero value. To save on compute, let's
+        # limit the check to the first batch member and head dimension.
+        # TODO: deprecate this function in favor of `cache_position`
+        if layer_idx != 0:
+            raise ValueError(
+                "`get_seq_length` on `HybridCache` may get inconsistent results depending on the layer index. "
+                "Using the `layer_idx` argument is not supported."
+            )
+        return (self.key_cache[layer_idx][0, 0].any(dim=-1)).sum()
 
     def reset(self):
         """Resets the cache values while preserving the objects"""
diff --git a/src/transformers/models/gemma2/modeling_gemma2.py b/src/transformers/models/gemma2/modeling_gemma2.py
index 1909ef785015..be964c9aed01 100644
--- a/src/transformers/models/gemma2/modeling_gemma2.py
+++ b/src/transformers/models/gemma2/modeling_gemma2.py
@@ -710,20 +710,13 @@ def _check_and_enable_sdpa(cls, config, hard_check_only: bool = False):
             config.n_positions - 1]`.
 
             [What are position IDs?](../glossary#position-ids)
-        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
+        past_key_values (`HybridCache`, *optional*):
             Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
             blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
             returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
 
-            Two formats are allowed:
-            - a [`~cache_utils.Cache`] instance, see our
-            [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
-            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
-            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
-            cache format.
-
-            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
-            legacy cache format will be returned.
+            Gemma 2 uses a unique cache class, [`HybridCache`], and does not guarantee full compatibility with other
+            cache classes.
 
             If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
             have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
@@ -789,7 +782,7 @@ def forward(
         input_ids: torch.LongTensor = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+        past_key_values: Optional[HybridCache] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
@@ -818,19 +811,8 @@ def forward(
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
 
-        if cache_position is None:
-            if past_key_values is None:
-                cache_position = torch.arange(0, inputs_embeds.shape[1], device=inputs_embeds.device)
-            else:
-                raise ValueError("When `past_key_values` is passed, `cache_position` must be too")
-
-        # Probably a forward call with caching, so we set up cache for one call only
-        if use_cache and past_key_values is None and not self.training:
-            logger.warning_once(
-                "You are calling the model with `use_cache=True` but didn't pass `past_key_values` while not training. ",
-                "If you want to compute with cache, make sure to pass an instance of `HybridCache`. An empty `HybridCache` instance "
-                "will be created for this call. See for more: (https://huggingface.co/docs/transformers/main/en/internal/generation_utils#transformers.HybridCache)",
-            )
+        # Instantiate an empty cache if needed.
+        if use_cache and past_key_values is None:
             batch_size, seq_len, _ = inputs_embeds.shape
             past_key_values = HybridCache(
                 self.config,
@@ -840,6 +822,11 @@ def forward(
                 dtype=inputs_embeds.dtype,
             )
 
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
         if position_ids is None:
             position_ids = cache_position.unsqueeze(0)
 
@@ -912,7 +899,7 @@ def _update_causal_mask(
         attention_mask: torch.Tensor,
         input_tensor: torch.Tensor,
         cache_position: torch.Tensor,
-        past_key_values: Cache,
+        past_key_values: HybridCache,
         output_attentions: bool,
     ):
         # Flash Attention currently doesn't support static cache but Gemma2 work only with static cache.
@@ -981,7 +968,7 @@ def forward(
         input_ids: torch.LongTensor = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+        past_key_values: Optional[HybridCache] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
         labels: Optional[torch.LongTensor] = None,
         use_cache: Optional[bool] = None,
@@ -1202,7 +1189,7 @@ def forward(
         input_ids: torch.LongTensor = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+        past_key_values: Optional[HybridCache] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
         labels: Optional[torch.LongTensor] = None,
         use_cache: Optional[bool] = None,
diff --git a/src/transformers/models/mimi/modeling_mimi.py b/src/transformers/models/mimi/modeling_mimi.py
index db36250b3d89..d91b057ef28e 100644
--- a/src/transformers/models/mimi/modeling_mimi.py
+++ b/src/transformers/models/mimi/modeling_mimi.py
@@ -1000,8 +1000,16 @@ def forward(
             )
             use_cache = False
 
-        if use_cache and past_key_values is None and not self.training:
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+        if use_cache and not isinstance(past_key_values, Cache):
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
 
         if cache_position is None:
             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
diff --git a/tests/models/gemma2/test_modeling_gemma2.py b/tests/models/gemma2/test_modeling_gemma2.py
index 918ed847f83d..4e7b3553460f 100644
--- a/tests/models/gemma2/test_modeling_gemma2.py
+++ b/tests/models/gemma2/test_modeling_gemma2.py
@@ -86,10 +86,15 @@ def setUp(self):
     def test_model_outputs_equivalence(self, **kwargs):
         pass
 
+    @parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
     @unittest.skip("Gemma2's eager attn/sdpa attn outputs are expected to be different")
     def test_eager_matches_sdpa_inference(self):
         pass
 
+    @unittest.skip("Gemma2's eager attn/sdpa attn outputs are expected to be different")
+    def test_eager_matches_sdpa_generate(self):
+        pass
+
     @parameterized.expand([("random",), ("same",)])
     @unittest.skip("Gemma2 has HybridCache which is not compatible with assisted decoding")
     def test_assisted_decoding_matches_greedy_search(self, assistant_type):

From f111d5b783a658b4089e5c82a104fbab397225cd Mon Sep 17 00:00:00 2001
From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com>
Date: Thu, 19 Sep 2024 14:14:06 -0400
Subject: [PATCH 56/67] Uniformize kwargs for Paligemma processor and update
 docs (#33571)

* Uniformize paligemma processor

* nit
---
 docs/source/en/model_doc/paligemma.md         |   4 +-
 .../models/paligemma/modeling_paligemma.py    |   2 +-
 .../models/paligemma/processing_paligemma.py  | 114 ++++++++----------
 .../paligemma/test_modeling_paligemma.py      |  18 +--
 .../paligemma/test_processor_paligemma.py     |  89 ++++++++++++++
 5 files changed, 153 insertions(+), 74 deletions(-)
 create mode 100644 tests/models/paligemma/test_processor_paligemma.py

diff --git a/docs/source/en/model_doc/paligemma.md b/docs/source/en/model_doc/paligemma.md
index 48debe593f97..41d785bba29d 100644
--- a/docs/source/en/model_doc/paligemma.md
+++ b/docs/source/en/model_doc/paligemma.md
@@ -41,7 +41,7 @@ processor = AutoProcessor.from_pretrained(model_id)
 prompt = "What is on the flower?"
 image_file = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg?download=true"
 raw_image = Image.open(requests.get(image_file, stream=True).raw)
-inputs = processor(prompt, raw_image, return_tensors="pt")
+inputs = processor(raw_image, prompt, return_tensors="pt")
 output = model.generate(**inputs, max_new_tokens=20)
 
 print(processor.decode(output[0], skip_special_tokens=True)[len(prompt):])
@@ -53,7 +53,7 @@ print(processor.decode(output[0], skip_special_tokens=True)[len(prompt):])
 ```python
 prompt = "What is on the flower?"
 answer = "a bee"
-inputs = processor(text=prompt, images=raw_image, suffix=answer, return_tensors="pt")
+inputs = processor(images=raw_image, text=prompt, suffix=answer, return_tensors="pt")
 ```
 
 ## Resources
diff --git a/src/transformers/models/paligemma/modeling_paligemma.py b/src/transformers/models/paligemma/modeling_paligemma.py
index 4e456b9f08e2..48fffb6b428d 100644
--- a/src/transformers/models/paligemma/modeling_paligemma.py
+++ b/src/transformers/models/paligemma/modeling_paligemma.py
@@ -443,7 +443,7 @@ def forward(
         >>> url = "https://huggingface.co/gv-hf/PaliGemma-test-224px-hf/resolve/main/cow_beach_1.png"
         >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> inputs = processor(text=prompt, images=image, return_tensors="pt")
+        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")
 
         >>> # Generate
         >>> generate_ids = model.generate(**inputs, max_length=30)
diff --git a/src/transformers/models/paligemma/processing_paligemma.py b/src/transformers/models/paligemma/processing_paligemma.py
index 3d0ece60c367..4457b6fe957b 100644
--- a/src/transformers/models/paligemma/processing_paligemma.py
+++ b/src/transformers/models/paligemma/processing_paligemma.py
@@ -21,15 +21,19 @@
 
 from ...feature_extraction_utils import BatchFeature
 from ...image_utils import ImageInput, is_valid_image
-from ...processing_utils import ProcessorMixin
+from ...processing_utils import (
+    ImagesKwargs,
+    ProcessingKwargs,
+    ProcessorMixin,
+    TextKwargs,
+    Unpack,
+    _validate_images_text_input_order,
+)
 from ...tokenization_utils_base import (
     AddedToken,
-    PaddingStrategy,
     PreTokenizedInput,
     TextInput,
-    TruncationStrategy,
 )
-from ...utils import TensorType
 
 
 logger = logging.getLogger(__name__)
@@ -38,6 +42,27 @@
 EXTRA_TOKENS = [f"<loc{i:0>4}>" for i in range(1024)] + [f"<seg{i:0>3}>" for i in range(128)]
 
 
+class PaliGemmaTextKwargs(TextKwargs):
+    suffix: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]]
+
+
+class PaliGemmaImagesKwargs(ImagesKwargs):
+    do_convert_rgb: Optional[bool]
+
+
+class PaliGemmaProcessorKwargs(ProcessingKwargs, total=False):
+    text_kwargs: PaliGemmaTextKwargs
+    images_kwargs: PaliGemmaImagesKwargs
+    _defaults = {
+        "text_kwargs": {
+            "padding": False,
+        },
+        "images_kwargs": {
+            "data_format": "channels_first",
+        },
+    }
+
+
 # Copied from transformers.models.idefics2.processing_idefics2.is_url
 def is_url(val) -> bool:
     return isinstance(val, str) and val.startswith("http")
@@ -122,27 +147,11 @@ def __init__(
 
     def __call__(
         self,
-        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
         images: ImageInput = None,
-        tokenize_newline_separately: bool = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy] = None,
-        max_length=None,
-        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
-        do_resize: bool = None,
-        do_normalize: bool = None,
-        image_mean: Optional[Union[float, List[float]]] = None,
-        image_std: Optional[Union[float, List[float]]] = None,
-        data_format: Optional["ChannelDimension"] = "channels_first",  # noqa: F821
-        input_data_format: Optional[
-            Union[str, "ChannelDimension"]  # noqa: F821
-        ] = None,
-        resample: "PILImageResampling" = None,  # noqa: F821
-        do_convert_rgb: bool = None,
-        do_thumbnail: bool = None,
-        do_align_long_axis: bool = None,
-        do_rescale: bool = None,
-        suffix: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
+        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+        audio=None,
+        videos=None,
+        **kwargs: Unpack[PaliGemmaProcessorKwargs],
     ) -> BatchFeature:
         """
         Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
@@ -171,29 +180,14 @@ def __call__(
 
 
         Args:
-            text (`str`, `List[str]`, `List[List[str]]`):
-                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
-                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
-                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
             images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                 The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                 tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                 number of channels, H and W are image height and width.
-            tokenize_newline_separately (`bool`, defaults to `True`):
-                Adds a separately tokenized '\n' at the end of the prompt.
-            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
-                Select a strategy to pad the returned sequences (according to the model's padding side and padding
-                index) among:
-                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
-                  sequence if provided).
-                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
-                  acceptable input length for the model if that argument is not provided.
-                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
-                  lengths).
-            max_length (`int`, *optional*):
-                Maximum length of the returned list and optionally padding length (see above).
-            truncation (`bool`, *optional*):
-                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
             return_tensors (`str` or [`~utils.TensorType`], *optional*):
                 If set, will return tensors of a particular framework. Acceptable values are:
 
@@ -216,6 +210,15 @@ def __call__(
             - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
             - **labels** -- Labels compatible with training if `suffix` is not None
         """
+        # check if images and text inputs are reversed for BC
+        images, text = _validate_images_text_input_order(images, text)
+
+        output_kwargs = self._merge_kwargs(
+            PaliGemmaProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        suffix = output_kwargs["text_kwargs"].pop("suffix", None)
 
         return_token_type_ids = True if suffix is not None else False
 
@@ -251,30 +254,17 @@ def __call__(
             for prompt in text
         ]
 
-        pixel_values = self.image_processor(
-            images,
-            do_resize=do_resize,
-            do_normalize=do_normalize,
-            return_tensors=return_tensors,
-            image_mean=image_mean,
-            image_std=image_std,
-            input_data_format=input_data_format,
-            data_format=data_format,
-            resample=resample,
-            do_convert_rgb=do_convert_rgb,
-        )["pixel_values"]
-
-        if max_length is not None:
-            max_length += self.image_seq_length  # max_length has to account for the image tokens
+        pixel_values = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"]
+
+        # max_length has to account for the image tokens
+        if output_kwargs["text_kwargs"].get("max_length", None) is not None:
+            output_kwargs["text_kwargs"]["max_length"] += self.image_seq_length
 
         inputs = self.tokenizer(
             input_strings,
             text_pair=suffix,
-            return_tensors=return_tensors,
-            padding=padding,
-            max_length=max_length,
-            truncation=truncation,
             return_token_type_ids=return_token_type_ids,
+            **output_kwargs["text_kwargs"],
         )
 
         return_data = {**inputs, "pixel_values": pixel_values}
diff --git a/tests/models/paligemma/test_modeling_paligemma.py b/tests/models/paligemma/test_modeling_paligemma.py
index d592205443e1..3918292133b4 100644
--- a/tests/models/paligemma/test_modeling_paligemma.py
+++ b/tests/models/paligemma/test_modeling_paligemma.py
@@ -337,7 +337,7 @@ def test_small_model_integration_test(self):
             "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png"
         )
         raw_image = Image.open(requests.get(image_file, stream=True).raw)
-        inputs = self.processor(text=prompt, images=raw_image, return_tensors="pt")
+        inputs = self.processor(images=raw_image, text=prompt, return_tensors="pt")
         EXPECTED_INPUT_IDS = torch.tensor([[257152] * 256 + [2, 108]])
         self.assertTrue(torch.equal(inputs["input_ids"], EXPECTED_INPUT_IDS))
 
@@ -360,7 +360,7 @@ def test_small_model_integration_test_paligemma_VQA(self):
             "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png"
         )
         raw_image = Image.open(requests.get(image_file, stream=True).raw)
-        inputs = self.processor(text=prompt, images=raw_image, return_tensors="pt").to(torch.float16)
+        inputs = self.processor(images=raw_image, text=prompt, return_tensors="pt").to(torch.float16)
 
         output = model.generate(**inputs, max_new_tokens=900, do_sample=False)
         EXPECTED_DECODED_TEXT = "answer en Where is the cow standing?\nbeach"  # fmt: skip
@@ -382,7 +382,7 @@ def test_small_model_integration_test_paligemma_empty_prompt(self):
             "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/cow_beach_1.png"
         )
         raw_image = Image.open(requests.get(image_file, stream=True).raw)
-        inputs = self.processor(text=prompt, images=raw_image, return_tensors="pt").to(torch.float16)
+        inputs = self.processor(images=raw_image, text=prompt, return_tensors="pt").to(torch.float16)
 
         output = model.generate(**inputs, max_new_tokens=900, do_sample=False)
         EXPECTED_DECODED_TEXT = "\ncow on the beach"  # fmt: skip
@@ -412,7 +412,7 @@ def test_small_model_integration_test_paligemma_batched(self):
         )
         image2 = image1
 
-        inputs = self.processor(text=prompts, images=[image1, image2], return_tensors="pt", padding=True)
+        inputs = self.processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True)
 
         output = model.generate(**inputs, max_new_tokens=20)
 
@@ -443,7 +443,7 @@ def test_small_model_integration_test_paligemma_batched_bf16(self):
         image2 = image1
 
         inputs = (
-            self.processor(text=prompts, images=[image1, image2], return_tensors="pt", padding=True)
+            self.processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True)
             .to(torch.bfloat16)
             .to(torch_device)
         )
@@ -475,7 +475,7 @@ def test_small_model_integration_test_paligemma_batched_f16(self):
         image2 = image1
 
         inputs = (
-            self.processor(text=prompts, images=[image1, image2], return_tensors="pt", padding=True)
+            self.processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True)
             .to(torch.float16)
             .to(torch_device)
         )
@@ -504,7 +504,7 @@ def test_integration_detection_bug(self):
             ).raw
         )
 
-        inputs = self.processor(text=prompt, images=image, return_tensors="pt").to(torch.bfloat16).to(torch_device)
+        inputs = self.processor(images=image, text=prompt, return_tensors="pt").to(torch.bfloat16).to(torch_device)
 
         output = model.generate(**inputs, max_new_tokens=20)
 
@@ -528,8 +528,8 @@ def test_paligemma_index_error_bug(self):
 
         raw_image = Image.open(requests.get(image_file, stream=True).raw)
         inputs = self.processor(
-            text=prompt,
             images=raw_image,
+            text=prompt,
             return_tensors="pt",
         ).to(torch.float16)
 
@@ -561,7 +561,7 @@ def test_paligemma_finetuning_with_suffixes_bf16(self):
         image2 = image1
 
         inputs = (
-            self.processor(text=prompts, suffix=suffixes, images=[image1, image2], return_tensors="pt", padding=True)
+            self.processor(images=[image1, image2], text=prompts, suffix=suffixes, return_tensors="pt", padding=True)
             .to(torch.bfloat16)
             .to(torch_device)
         )
diff --git a/tests/models/paligemma/test_processor_paligemma.py b/tests/models/paligemma/test_processor_paligemma.py
new file mode 100644
index 000000000000..47810f183241
--- /dev/null
+++ b/tests/models/paligemma/test_processor_paligemma.py
@@ -0,0 +1,89 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import shutil
+import tempfile
+import unittest
+
+from transformers import GemmaTokenizer
+from transformers.testing_utils import get_tests_dir, require_torch, require_vision
+from transformers.utils import is_vision_available
+
+from ...test_processing_common import ProcessorTesterMixin
+
+
+if is_vision_available():
+    from transformers import (
+        PaliGemmaProcessor,
+        SiglipImageProcessor,
+        is_vision_available,
+    )
+
+SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
+
+
+@require_vision
+class PaliGemmaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
+    processor_class = PaliGemmaProcessor
+
+    def setUp(self):
+        self.tmpdirname = tempfile.mkdtemp()
+        image_processor = SiglipImageProcessor.from_pretrained("google/siglip-so400m-patch14-384")
+        image_processor.image_seq_length = 0
+        tokenizer = GemmaTokenizer(SAMPLE_VOCAB, keep_accents=True)
+        processor = PaliGemmaProcessor(image_processor=image_processor, tokenizer=tokenizer)
+        processor.save_pretrained(self.tmpdirname)
+
+    def tearDown(self):
+        shutil.rmtree(self.tmpdirname)
+
+    @require_torch
+    @require_vision
+    def test_image_seq_length(self):
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer", max_length=112, padding="max_length")
+        image_processor.image_seq_length = 14
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        inputs = processor(
+            text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length"
+        )
+        self.assertEqual(len(inputs["input_ids"][0]), 112 + 14)
+
+    @require_torch
+    @require_vision
+    def test_unstructured_kwargs_batched(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = ["lower newer", "upper older longer string"]
+        image_input = self.prepare_image_inputs() * 2
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            size={"height": 214, "width": 214},
+            padding="longest",
+            max_length=76,
+        )
+
+        self.assertEqual(inputs["pixel_values"].shape[2], 214)
+
+        self.assertEqual(len(inputs["input_ids"][0]), 10)

From b87755aa6d06253436b9451b9ea0bc1f448733f3 Mon Sep 17 00:00:00 2001
From: Fanli Lin <fanli.lin@intel.com>
Date: Fri, 20 Sep 2024 02:28:04 +0800
Subject: [PATCH 57/67] [tests] skip tests for xpu  (#33553)

* enable

* fix

* add xpu skip

* add marker

* skip for xpu

* add more

* add one more
---
 src/transformers/testing_utils.py                   |  7 +++++++
 tests/extended/test_trainer_ext.py                  |  2 ++
 tests/models/layoutlmv2/test_modeling_layoutlmv2.py | 10 +++++++++-
 tests/test_modeling_common.py                       |  3 +++
 tests/trainer/test_trainer.py                       |  3 +++
 tests/utils/test_cache_utils.py                     |  2 ++
 6 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index 3306f76249fe..b86e3af91ca7 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -849,6 +849,13 @@ def require_torch_xpu(test_case):
     return unittest.skipUnless(is_torch_xpu_available(), "test requires XPU device")(test_case)
 
 
+def require_non_xpu(test_case):
+    """
+    Decorator marking a test that should be skipped for XPU.
+    """
+    return unittest.skipUnless(torch_device != "xpu", "test requires a non-XPU")(test_case)
+
+
 def require_torch_multi_xpu(test_case):
     """
     Decorator marking a test that requires a multi-XPU setup (in PyTorch). These tests are skipped on a machine without
diff --git a/tests/extended/test_trainer_ext.py b/tests/extended/test_trainer_ext.py
index 9bf34c366927..4f6cf7dffa1e 100644
--- a/tests/extended/test_trainer_ext.py
+++ b/tests/extended/test_trainer_ext.py
@@ -31,6 +31,7 @@
     get_torch_dist_unique_port,
     require_apex,
     require_bitsandbytes,
+    require_non_xpu,
     require_torch,
     require_torch_gpu,
     require_torch_multi_accelerator,
@@ -106,6 +107,7 @@ def test_run_seq2seq_dp(self):
     def test_run_seq2seq_ddp(self):
         self.run_seq2seq_quick(distributed=True)
 
+    @require_non_xpu
     @require_apex
     @require_torch_gpu
     def test_run_seq2seq_apex(self):
diff --git a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
index 3d366fe3e84e..94cc4e95432c 100644
--- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
+++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py
@@ -16,7 +16,14 @@
 
 import unittest
 
-from transformers.testing_utils import require_detectron2, require_torch, require_torch_multi_gpu, slow, torch_device
+from transformers.testing_utils import (
+    require_detectron2,
+    require_non_xpu,
+    require_torch,
+    require_torch_multi_gpu,
+    slow,
+    torch_device,
+)
 from transformers.utils import is_detectron2_available, is_torch_available
 
 from ...test_configuration_common import ConfigTester
@@ -251,6 +258,7 @@ def prepare_config_and_inputs_for_common(self):
         return config, inputs_dict
 
 
+@require_non_xpu
 @require_torch
 @require_detectron2
 class LayoutLMv2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 8eab385ad48a..1ad6e93b10ff 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -76,6 +76,7 @@
     require_accelerate,
     require_bitsandbytes,
     require_flash_attn,
+    require_non_xpu,
     require_read_token,
     require_safetensors,
     require_torch,
@@ -2884,6 +2885,7 @@ def test_inputs_embeds_matches_input_ids_with_generate(self):
                 )
             self.assertTrue(torch.allclose(out_embeds, out_ids))
 
+    @require_non_xpu
     @require_torch_multi_gpu
     def test_multi_gpu_data_parallel_forward(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -4118,6 +4120,7 @@ def test_sdpa_can_dispatch_on_flash(self):
                 with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
                     _ = model(**inputs_dict)
 
+    @require_non_xpu
     @require_torch_sdpa
     @require_torch_accelerator
     @slow
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index c5f8b6169fcf..14014e4a0947 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -66,6 +66,7 @@
     require_intel_extension_for_pytorch,
     require_liger_kernel,
     require_lomo,
+    require_non_xpu,
     require_optuna,
     require_peft,
     require_ray,
@@ -884,6 +885,7 @@ def test_mixed_bf16(self):
 
         # will add more specific tests once there are some bugs to fix
 
+    @require_non_xpu
     @require_torch_gpu
     @require_torch_tf32
     def test_tf32(self):
@@ -3196,6 +3198,7 @@ def test_fp16_full_eval(self):
         # perfect world: fp32_init/2 == fp16_eval
         self.assertAlmostEqual(fp16_eval, fp32_init / 2, delta=5_000)
 
+    @require_non_xpu
     @require_torch_non_multi_gpu
     @require_torchdynamo
     @require_torch_tensorrt_fx
diff --git a/tests/utils/test_cache_utils.py b/tests/utils/test_cache_utils.py
index 6ab821231fd5..3e8c80de2d16 100644
--- a/tests/utils/test_cache_utils.py
+++ b/tests/utils/test_cache_utils.py
@@ -22,6 +22,7 @@
 from transformers.testing_utils import (
     is_torch_available,
     require_auto_gptq,
+    require_non_xpu,
     require_read_token,
     require_torch,
     require_torch_gpu,
@@ -317,6 +318,7 @@ def test_hybrid_cache_n_sequences(self):
         ]
         self.assertListEqual(decoded, expected_text)
 
+    @require_non_xpu
     @require_auto_gptq
     def test_sink_cache_hard(self):
         tokenizer = AutoTokenizer.from_pretrained("TheBloke/LLaMa-7B-GPTQ")

From 4d8908df272c0a9db2e5fbcc8aaed73cdf75442a Mon Sep 17 00:00:00 2001
From: Fanli Lin <fanli.lin@intel.com>
Date: Fri, 20 Sep 2024 02:39:19 +0800
Subject: [PATCH 58/67] [tests] enable GemmaIntegrationTest on XPU  (#33555)

enable GemmaIntegrationTest
---
 tests/models/gemma/test_modeling_gemma.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tests/models/gemma/test_modeling_gemma.py b/tests/models/gemma/test_modeling_gemma.py
index b564d51216d2..a02541d58544 100644
--- a/tests/models/gemma/test_modeling_gemma.py
+++ b/tests/models/gemma/test_modeling_gemma.py
@@ -528,7 +528,7 @@ def test_flash_attn_2_equivalence(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class GemmaIntegrationTest(unittest.TestCase):
     input_text = ["Hello I am doing", "Hi today"]
     # This variable is used to determine which CUDA device are we using for our runners (A10 or T4)
@@ -748,7 +748,6 @@ def test_model_7b_bf16(self):
 
         output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
         output_text = tokenizer.batch_decode(output, skip_special_tokens=True)
-
         self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version])
 
     @require_read_token
@@ -770,10 +769,8 @@ def test_model_7b_fp16_static_cache(self):
 
         tokenizer = AutoTokenizer.from_pretrained(model_id)
         inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device)
-
         output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
         output_text = tokenizer.batch_decode(output, skip_special_tokens=True)
-
         self.assertEqual(output_text, EXPECTED_TEXTS)
 
     @require_bitsandbytes

From 0c718f16d1e8b73ac637529f6328fd8cb378ce7e Mon Sep 17 00:00:00 2001
From: Pedro Cuenca <pedro@huggingface.co>
Date: Fri, 20 Sep 2024 01:28:33 +0200
Subject: [PATCH 59/67] Fix Llama 3 TikToken conversion (#33538)

* Fix Llama 3 TikToken conversion

* No need to add tokens again
---
 src/transformers/models/llama/convert_llama_weights_to_hf.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/transformers/models/llama/convert_llama_weights_to_hf.py b/src/transformers/models/llama/convert_llama_weights_to_hf.py
index a75ce5245eee..99aa198bf62c 100644
--- a/src/transformers/models/llama/convert_llama_weights_to_hf.py
+++ b/src/transformers/models/llama/convert_llama_weights_to_hf.py
@@ -332,7 +332,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim):
 
 class Llama3Converter(TikTokenConverter):
     def __init__(self, vocab_file, special_tokens=None, instruct=False, model_max_length=None, **kwargs):
-        super().__init__(vocab_file, **kwargs)
+        super().__init__(vocab_file, additional_special_tokens=special_tokens, **kwargs)
         tokenizer = self.converted()
         chat_template = (
             "{% set loop_messages = messages %}"
@@ -345,7 +345,6 @@ def __init__(self, vocab_file, special_tokens=None, instruct=False, model_max_le
             "{% endfor %}"
             "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"
         )
-        tokenizer.add_special_tokens(special_tokens)
 
         self.tokenizer = PreTrainedTokenizerFast(
             tokenizer_object=tokenizer,

From bdf4649f6722776cc3f68f86ed64ac7556ea5efb Mon Sep 17 00:00:00 2001
From: Joao Gante <joaofranciscocardosogante@gmail.com>
Date: Fri, 20 Sep 2024 09:37:39 +0100
Subject: [PATCH 60/67] Docs: add the ability to manually trigger jobs (#33598)

---
 .github/workflows/build_documentation.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml
index b25567fb092a..c55638ded149 100644
--- a/.github/workflows/build_documentation.yml
+++ b/.github/workflows/build_documentation.yml
@@ -1,6 +1,7 @@
 name: Build documentation
 
 on:
+  workflow_dispatch:
   push:
     branches:
       - main

From 6dc364616d58b9fdc70f88cd2c136e741fc28d26 Mon Sep 17 00:00:00 2001
From: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Date: Fri, 20 Sep 2024 10:57:21 +0200
Subject: [PATCH 61/67] Fix CircleCI nightly run (#33558)

fix

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
---
 .circleci/config.yml   | 53 +++++++++++++++++++++++++++++++-----------
 utils/tests_fetcher.py |  9 ++++---
 2 files changed, 46 insertions(+), 16 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 483e315e260c..0c03538f07db 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -82,22 +82,49 @@ jobs:
         parallelism: 1
         steps:
             - checkout
-            - run: uv pip install -e .
-            - run: |
-                  mkdir test_preparation
-                  echo -n "tests" > test_preparation/test_list.txt
-                  echo -n "all" > test_preparation/examples_test_list.txt
-                  echo -n "tests/repo_utils" > test_preparation/test_repo_utils.txt
+            - run: uv pip install -U -e .
+            - run: echo 'export "GIT_COMMIT_MESSAGE=$(git show -s --format=%s)"' >> "$BASH_ENV" && source "$BASH_ENV"
+            - run: mkdir -p test_preparation
+            - run: python utils/tests_fetcher.py --fetch_all | tee tests_fetched_summary.txt
+            - run: python utils/tests_fetcher.py --filter_tests
+            - run: export "GIT_COMMIT_MESSAGE=$(git show -s --format=%s)" && echo $GIT_COMMIT_MESSAGE && python .circleci/create_circleci_config.py --fetcher_folder test_preparation
             - run: |
-                  echo -n "tests" > test_list.txt
-                  python utils/tests_fetcher.py --filter_tests
-                  mv test_list.txt test_preparation/filtered_test_list.txt
-            - run: python .circleci/create_circleci_config.py --fetcher_folder test_preparation
-            - run: cp test_preparation/generated_config.yml test_preparation/generated_config.txt
+                if [ ! -s test_preparation/generated_config.yml ]; then
+                    echo "No tests to run, exiting early!"
+                    circleci-agent step halt
+                fi
+
             - store_artifacts:
-                  path: test_preparation/generated_config.txt
+                path: test_preparation
+
+            - run:
+                name: "Retrieve Artifact Paths"
+                env:
+                    CIRCLE_TOKEN: ${{ secrets.CI_ARTIFACT_TOKEN }}
+                command: |
+                    project_slug="gh/${CIRCLE_PROJECT_USERNAME}/${CIRCLE_PROJECT_REPONAME}"
+                    job_number=${CIRCLE_BUILD_NUM}
+                    url="https://circleci.com/api/v2/project/${project_slug}/${job_number}/artifacts"
+                    curl -o  test_preparation/artifacts.json ${url}
+            - run:
+                name: "Prepare pipeline parameters"
+                command: |
+                    python utils/process_test_artifacts.py 
+
+            # To avoid too long generated_config.yaml on the continuation orb, we pass the links to the artifacts as parameters.
+            # Otherwise the list of tests was just too big. Explicit is good but for that it was a limitation.
+            # We used:
+
+            # https://circleci.com/docs/api/v2/index.html#operation/getJobArtifacts : to get the job artifacts
+            # We could not pass a nested dict, which is why we create the test_file_... parameters for every single job
+
+            - store_artifacts:
+                path: test_preparation/transformed_artifacts.json
+            - store_artifacts:
+                path: test_preparation/artifacts.json
             - continuation/continue:
-                  configuration_path: test_preparation/generated_config.yml
+                parameters:  test_preparation/transformed_artifacts.json
+                configuration_path: test_preparation/generated_config.yml
 
     check_code_quality:
         working_directory: ~/transformers
diff --git a/utils/tests_fetcher.py b/utils/tests_fetcher.py
index 8ccd738c1df6..1aa663e967c9 100644
--- a/utils/tests_fetcher.py
+++ b/utils/tests_fetcher.py
@@ -1193,9 +1193,9 @@ def create_test_list_from_filter(full_test_list, out_path):
         default=None,
     )
     parser.add_argument(
-        "--commit_message",
-        type=str,
-        help="The commit message (which could contain a command to force all tests or skip the CI).",
+        "--fetch_all",
+        action="store_true",
+        help="Will fetch all tests.",
         default=None,
     )
     args = parser.parse_args()
@@ -1212,6 +1212,9 @@ def create_test_list_from_filter(full_test_list, out_path):
             quit()
         if commit_flags["no_filter"]:
             print("Running all tests fetched without filtering.")
+
+        if args.fetch_all:
+            commit_flags["test_all"] = True
         if commit_flags["test_all"]:
             print("Force-launching all tests")
 

From 31650a53a1d7f80470d2ba8010966c2cab97eec8 Mon Sep 17 00:00:00 2001
From: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Date: Fri, 20 Sep 2024 11:00:34 +0200
Subject: [PATCH 62/67] Allow CI could be run on private forked repositories
 (e.g. new model additions) (#33594)

fix

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
---
 .circleci/config.yml | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 0c03538f07db..9932156aa969 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -47,13 +47,13 @@ jobs:
 
             - run:
                 name: "Retrieve Artifact Paths"
-                env:
-                    CIRCLE_TOKEN: ${{ secrets.CI_ARTIFACT_TOKEN }}
+                # [reference] https://circleci.com/docs/api/v2/index.html#operation/getJobArtifacts
+                # `CIRCLE_TOKEN` is defined as an environment variables set within a context, see `https://circleci.com/docs/contexts/`
                 command: |
                     project_slug="gh/${CIRCLE_PROJECT_USERNAME}/${CIRCLE_PROJECT_REPONAME}"
                     job_number=${CIRCLE_BUILD_NUM}
                     url="https://circleci.com/api/v2/project/${project_slug}/${job_number}/artifacts"
-                    curl -o  test_preparation/artifacts.json ${url}
+                    curl -o test_preparation/artifacts.json ${url} --header "Circle-Token: $CIRCLE_TOKEN"
             - run:
                 name: "Prepare pipeline parameters"
                 command: |
@@ -190,7 +190,10 @@ workflows:
             - check_circleci_user
             - check_code_quality
             - check_repository_consistency
-            - fetch_tests
+            - fetch_tests:
+                # [reference] https://circleci.com/docs/contexts/
+                context:
+                    - TRANSFORMERS_CONTEXT
 
     nightly:
         when: <<pipeline.parameters.nightly>>

From 8bd1f2f33888ec8a86cecd6b9d448d8e26940b63 Mon Sep 17 00:00:00 2001
From: Fanli Lin <fanli0116@gmail.com>
Date: Fri, 20 Sep 2024 17:16:43 +0800
Subject: [PATCH 63/67] [tests] make more tests device-agnostic (#33580)

* enable

* fix

* add xpu skip

* add marker

* skip for xpu

* add more

* enable on accelerator

* add more cases

* add more tests

* add more
---
 .../grounding_dino/test_modeling_grounding_dino.py       | 8 ++++----
 tests/models/llama/test_modeling_llama.py                | 6 ++++--
 tests/models/mistral/test_modeling_mistral.py            | 5 +++--
 .../recurrent_gemma/test_modeling_recurrent_gemma.py     | 4 ++--
 tests/models/univnet/test_modeling_univnet.py            | 7 ++++---
 tests/models/whisper/test_modeling_whisper.py            | 9 ++++++---
 tests/test_modeling_common.py                            | 2 +-
 7 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py
index 1b4970785aa6..c6e9671dd59a 100644
--- a/tests/models/grounding_dino/test_modeling_grounding_dino.py
+++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py
@@ -30,7 +30,7 @@
 from transformers.testing_utils import (
     require_timm,
     require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
     require_vision,
     slow,
     torch_device,
@@ -676,7 +676,7 @@ def test_inference_object_detection_head(self):
         self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes, atol=1e-2))
         self.assertListEqual(results["labels"], expected_labels)
 
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_inference_object_detection_head_equivalence_cpu_gpu(self):
         processor = self.default_processor
         image = prepare_img()
@@ -690,8 +690,8 @@ def test_inference_object_detection_head_equivalence_cpu_gpu(self):
             cpu_outputs = model(**encoding)
 
         # 2. run model on GPU
-        model.to("cuda")
-        encoding = encoding.to("cuda")
+        model.to(torch_device)
+        encoding = encoding.to(torch_device)
         with torch.no_grad():
             gpu_outputs = model(**encoding)
 
diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py
index c99357ff99b2..a21665c822f2 100644
--- a/tests/models/llama/test_modeling_llama.py
+++ b/tests/models/llama/test_modeling_llama.py
@@ -24,10 +24,12 @@
 
 from transformers import AutoTokenizer, LlamaConfig, StaticCache, is_torch_available, set_seed
 from transformers.testing_utils import (
+    backend_empty_cache,
     require_bitsandbytes,
     require_flash_attn,
     require_read_token,
     require_torch,
+    require_torch_accelerator,
     require_torch_gpu,
     require_torch_sdpa,
     slow,
@@ -899,11 +901,11 @@ def test_compile_static_cache(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class Mask4DTestHard(unittest.TestCase):
     def tearDown(self):
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def setUp(self):
         model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
diff --git a/tests/models/mistral/test_modeling_mistral.py b/tests/models/mistral/test_modeling_mistral.py
index 0da7ae72add7..0730f8ba4441 100644
--- a/tests/models/mistral/test_modeling_mistral.py
+++ b/tests/models/mistral/test_modeling_mistral.py
@@ -29,6 +29,7 @@
     require_flash_attn,
     require_read_token,
     require_torch,
+    require_torch_accelerator,
     require_torch_gpu,
     require_torch_sdpa,
     slow,
@@ -719,14 +720,14 @@ def test_compile_static_cache(self):
 
 
 @slow
-@require_torch_gpu
+@require_torch_accelerator
 class Mask4DTestHard(unittest.TestCase):
     model_name = "mistralai/Mistral-7B-v0.1"
     _model = None
 
     def tearDown(self):
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     @property
     def model(self):
diff --git a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py
index 1a58ee2970d8..23dace68cf21 100644
--- a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py
+++ b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py
@@ -21,7 +21,7 @@
     require_bitsandbytes,
     require_read_token,
     require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -418,7 +418,7 @@ def test_inputs_embeds_matches_input_ids_with_generate(self):
         pass
 
 
-@require_torch_gpu
+@require_torch_accelerator
 @slow
 class RecurrentGemmaIntegrationTest(unittest.TestCase):
     input_text = ["Hello I am doing", "Hi today"]
diff --git a/tests/models/univnet/test_modeling_univnet.py b/tests/models/univnet/test_modeling_univnet.py
index e160c799b786..f26a423a1a2f 100644
--- a/tests/models/univnet/test_modeling_univnet.py
+++ b/tests/models/univnet/test_modeling_univnet.py
@@ -21,9 +21,10 @@
 
 from transformers import UnivNetConfig, UnivNetFeatureExtractor
 from transformers.testing_utils import (
+    backend_empty_cache,
     is_torch_available,
     require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -207,13 +208,13 @@ def test_unbatched_inputs_outputs(self):
             self.assertTrue(outputs.shape[0] == 1, msg="Unbatched input should create batched output with bsz = 1")
 
 
-@require_torch_gpu
+@require_torch_accelerator
 @slow
 class UnivNetModelIntegrationTests(unittest.TestCase):
     def tearDown(self):
         super().tearDown()
         gc.collect()
-        torch.cuda.empty_cache()
+        backend_empty_cache(torch_device)
 
     def _load_datasamples(self, num_samples, sampling_rate=24000):
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index 70b38d3bf381..4bcf4252a60d 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -34,10 +34,12 @@
     is_flaky,
     is_pt_flax_cross_test,
     require_flash_attn,
+    require_non_xpu,
     require_torch,
+    require_torch_accelerator,
     require_torch_fp16,
     require_torch_gpu,
-    require_torch_multi_gpu,
+    require_torch_multi_accelerator,
     require_torchaudio,
     slow,
     torch_device,
@@ -2612,6 +2614,7 @@ def test_generate_with_prompt_ids_and_no_non_prompt_forced_decoder_ids(self):
 
         self.assertTrue(prompt in text)
 
+    @require_non_xpu
     @slow
     @require_torch_gpu
     def test_speculative_decoding_distil(self):
@@ -3239,7 +3242,7 @@ def test_whisper_longform_no_speech_detection(self):
         for i in range(num_samples):
             assert decoded_all[i] == EXPECTED_TEXT[i]
 
-    @require_torch_gpu
+    @require_torch_accelerator
     @slow
     def test_whisper_empty_longform(self):
         processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
@@ -3278,7 +3281,7 @@ def test_whisper_empty_longform(self):
         torch.manual_seed(0)
         model.generate(**inputs, **gen_kwargs)
 
-    @require_torch_multi_gpu
+    @require_torch_multi_accelerator
     @slow
     def test_whisper_empty_longform_multi_gpu(self):
         processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 1ad6e93b10ff..d55399a951c9 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -4751,7 +4751,7 @@ def test_static_cache_matches_dynamic(self):
 
     # For now, Let's focus only on GPU for `torch.compile`
     @slow
-    @require_torch_gpu
+    @require_torch_accelerator
     @require_read_token
     def test_torch_compile(self):
         if version.parse(torch.__version__) < version.parse("2.3"):

From ec1424c6a3cc91a3fa2570bbeb9c4431072b873b Mon Sep 17 00:00:00 2001
From: Lake Lee <101966044+klae01@users.noreply.github.com>
Date: Fri, 20 Sep 2024 19:40:57 +0900
Subject: [PATCH 64/67] Update modeling_mamba2.py, fix pad size (#32599)

* Update modeling_mamba2.py

Fix pad_size calculation to ensure it's less than self.chunk_size

* [run_slow] mamba2

* [run-slow] mamba2

* [run-slow] Add @require_read_token decorator to failing tests for token propagation

* [run_slow] mamba2
---
 src/transformers/models/mamba2/modeling_mamba2.py | 2 +-
 tests/models/mamba2/test_modeling_mamba2.py       | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/mamba2/modeling_mamba2.py b/src/transformers/models/mamba2/modeling_mamba2.py
index 7b414ff9570d..19d53437130e 100644
--- a/src/transformers/models/mamba2/modeling_mamba2.py
+++ b/src/transformers/models/mamba2/modeling_mamba2.py
@@ -510,7 +510,7 @@ def torch_forward(self, input_states, cache_params: Optional[Mamba2Cache]=None,
             C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float()
             B = B.repeat(1, 1, self.num_heads // self.n_groups, 1)
             C = C.repeat(1, 1, self.num_heads // self.n_groups, 1)
-            pad_size = self.chunk_size - (seq_len % self.chunk_size)
+            pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size
 
             D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size)
 
diff --git a/tests/models/mamba2/test_modeling_mamba2.py b/tests/models/mamba2/test_modeling_mamba2.py
index 55c18abe6b96..f19358a22f4b 100644
--- a/tests/models/mamba2/test_modeling_mamba2.py
+++ b/tests/models/mamba2/test_modeling_mamba2.py
@@ -291,6 +291,7 @@ def setUp(self):
         self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, from_slow=True, legacy=False)
         self.prompt = ("[INST]Write a hello world program in C++.",)
 
+    @require_read_token
     @parameterized.expand(
         [
             (torch_device,),
@@ -319,6 +320,7 @@ def test_simple_generate(self, device):
         ground_truth_sentence = """<s>[INST]Write a hello world program in C++.[/INST] Sure, here is a simple "Hello, World!" program in C++:\n\n```cpp\n#include <iostream>\n\n"""
         self.assertEqual(output_sentence, ground_truth_sentence)
 
+    @require_read_token
     @slow
     @require_torch_gpu
     def test_batched_equivalence_with_cache(self):
@@ -349,6 +351,7 @@ def test_batched_equivalence_with_cache(self):
             individual_output = tokenizer.batch_decode(individual_gen, skip_special_tokens=True)[0]
             self.assertEqual(individual_output[:100], batched_output[index_gen][:100])
 
+    @require_read_token
     @slow
     @require_torch_gpu
     def test_batched_equivalence_without_cache(self):

From 266d0a6375802075111ffb5e89e1b2a643ea44b8 Mon Sep 17 00:00:00 2001
From: Joao Gante <joaofranciscocardosogante@gmail.com>
Date: Fri, 20 Sep 2024 13:50:42 +0100
Subject: [PATCH 65/67] Generate: remove flakyness in
 `test_generate_from_inputs_embeds_decoder_only` (#33602)

almost zero is not zero
---
 tests/generation/test_utils.py | 40 ++++++++++++++++++++++++----------
 1 file changed, 28 insertions(+), 12 deletions(-)

diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py
index 413e920609a8..26ece9c25d06 100644
--- a/tests/generation/test_utils.py
+++ b/tests/generation/test_utils.py
@@ -1647,26 +1647,42 @@ def test_generate_from_inputs_embeds_decoder_only(self):
                 continue
 
             # Traditional way of generating text
-            outputs_from_ids = model.generate(input_ids, max_new_tokens=5)
-            self.assertEqual(outputs_from_ids.shape, (input_ids.shape[0], input_ids.shape[1] + 5))
+            outputs_from_ids = model.generate(
+                input_ids, max_new_tokens=5, return_dict_in_generate=True, output_scores=True
+            )
+            self.assertEqual(outputs_from_ids.sequences.shape, (input_ids.shape[0], input_ids.shape[1] + 5))
 
             # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output)
             inputs_embeds = model.get_input_embeddings()(input_ids)
-            outputs_from_embeds = model.generate(input_ids, inputs_embeds=inputs_embeds, max_new_tokens=5)
-            self.assertListEqual(outputs_from_ids.tolist(), outputs_from_embeds.tolist())
+            outputs_from_embeds = model.generate(
+                input_ids,
+                inputs_embeds=inputs_embeds,
+                max_new_tokens=5,
+                return_dict_in_generate=True,
+                output_scores=True,
+            )
+            self.assertListEqual(outputs_from_ids.sequences.tolist(), outputs_from_embeds.sequences.tolist())
 
-            # But if we pass different inputs_embeds, we should get different outputs
-            torch.manual_seed(0)
+            # But if we pass different inputs_embeds, we should get different outputs (the output text may be the
+            # same, but the logits will almost surely be different)
             random_embeds = torch.rand_like(inputs_embeds)
-            outputs_from_rand_embeds = model.generate(input_ids, inputs_embeds=random_embeds, max_new_tokens=5)
-            with self.assertRaises(AssertionError):
-                self.assertListEqual(outputs_from_rand_embeds.tolist(), outputs_from_embeds.tolist())
+            outputs_from_rand_embeds = model.generate(
+                input_ids,
+                inputs_embeds=random_embeds,
+                max_new_tokens=5,
+                return_dict_in_generate=True,
+                output_scores=True,
+            )
+            for i in range(len(outputs_from_rand_embeds.scores)):
+                self.assertFalse(torch.allclose(outputs_from_embeds.scores[i], outputs_from_rand_embeds.scores[i]))
 
             # input_ids is not a required input -- if we don't pass it, the newly generated tokens will be the same
-            outputs_from_embeds_wo_ids = model.generate(inputs_embeds=inputs_embeds, max_new_tokens=5)
+            outputs_from_embeds_wo_ids = model.generate(
+                inputs_embeds=inputs_embeds, max_new_tokens=5, return_dict_in_generate=True, output_scores=True
+            )
             self.assertListEqual(
-                outputs_from_embeds[:, inputs_embeds.shape[1] :].tolist(),
-                outputs_from_embeds_wo_ids.tolist(),
+                outputs_from_embeds.sequences[:, inputs_embeds.shape[1] :].tolist(),
+                outputs_from_embeds_wo_ids.sequences.tolist(),
             )
 
     @pytest.mark.generate

From f9b44097261270377199bb0fcbceb01fd9f7d0c0 Mon Sep 17 00:00:00 2001
From: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
Date: Fri, 20 Sep 2024 14:20:57 +0100
Subject: [PATCH 66/67] Remove unnecessary CPM model tests (#33621)

Remove model tests
---
 tests/models/cpm/test_tokenization_cpm.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/models/cpm/test_tokenization_cpm.py b/tests/models/cpm/test_tokenization_cpm.py
index fa69a6aaa7dc..152671eb6807 100644
--- a/tests/models/cpm/test_tokenization_cpm.py
+++ b/tests/models/cpm/test_tokenization_cpm.py
@@ -12,15 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import unittest
 
 from transformers.models.cpm.tokenization_cpm import CpmTokenizer
 from transformers.testing_utils import custom_tokenizers
 
-from ..xlnet.test_modeling_xlnet import XLNetModelTest
-
 
 @custom_tokenizers
-class CpmTokenizationTest(XLNetModelTest):
+class CpmTokenizationTest(unittest.TestCase):
     # There is no `CpmModel`
     def is_pipeline_test_to_skip(
         self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name

From 653eb40425344b89b5a24e7b07eb3095b04cdc9d Mon Sep 17 00:00:00 2001
From: Omar Salman <omar.salman@arbisoft.com>
Date: Fri, 20 Sep 2024 18:27:32 +0500
Subject: [PATCH 67/67] Add sdpa for BioGpt (#33592)

* Add sdpa for BioGpt

* Updates

* Add the docs

* [run_slow] biogpt

* Use the copy mechanism to ensure consistency

* [run_slow] biogpt
---
 docs/source/en/model_doc/biogpt.md            |  45 ++++++
 docs/source/en/perf_infer_gpu_one.md          |   1 +
 .../models/biogpt/modeling_biogpt.py          | 133 +++++++++++++++++-
 3 files changed, 174 insertions(+), 5 deletions(-)

diff --git a/docs/source/en/model_doc/biogpt.md b/docs/source/en/model_doc/biogpt.md
index 20a8e4d9cd30..7d0943d5393d 100644
--- a/docs/source/en/model_doc/biogpt.md
+++ b/docs/source/en/model_doc/biogpt.md
@@ -32,6 +32,51 @@ This model was contributed by [kamalkraj](https://huggingface.co/kamalkraj). The
 - BioGPT was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next token in a sequence. Leveraging this feature allows BioGPT to generate syntactically coherent text as it can be observed in the run_generation.py example script.
 - The model can take the `past_key_values` (for PyTorch) as input, which is the previously computed key/value attention pairs. Using this (past_key_values or past) value prevents the model from re-computing pre-computed values in the context of text generation. For PyTorch, see past_key_values argument of the BioGptForCausalLM.forward() method for more information on its usage.
 
+### Using Scaled Dot Product Attention (SDPA)
+
+PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function 
+encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the 
+[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) 
+or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
+page for more information.
+
+SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set 
+`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
+
+```
+from transformers import BioGptForCausalLM
+model = BioGptForCausalLM.from_pretrained("microsoft/biogpt", attn_implementation="sdpa", torch_dtype=torch.float16)
+```
+
+On a local benchmark (NVIDIA GeForce RTX 2060-8GB, PyTorch 2.3.1, OS Ubuntu 20.04) with `float16` and `microsoft/biogpt` model with a CausalLM head,
+we saw the following speedups during training.
+
+For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).
+
+| num_training_steps | batch_size | seq_len | is cuda | Time per batch (eager - s) | Time per batch (sdpa - s) | Speedup (%) | Eager peak mem (MB) | sdpa peak mem (MB) | Mem saving (%) |
+|--------------------|------------|---------|---------|----------------------------|---------------------------|-------------|---------------------|--------------------|----------------|
+| 100                | 1          | 128     | False   | 0.038                      | 0.031                     | 21.301      | 1601.862            | 1601.497           | 0.023          |
+| 100                | 1          | 256     | False   | 0.039                      | 0.034                     | 15.084      | 1624.944            | 1625.296           | -0.022         |
+| 100                | 2          | 128     | False   | 0.039                      | 0.033                     | 16.820      | 1624.567            | 1625.296           | -0.045         |
+| 100                | 2          | 256     | False   | 0.065                      | 0.059                     | 10.255      | 1672.164            | 1672.164           | 0.000          |
+| 100                | 4          | 128     | False   | 0.062                      | 0.058                     | 6.998       | 1671.435            | 1672.164           | -0.044         |
+| 100                | 4          | 256     | False   | 0.113                      | 0.100                     | 13.316      | 2350.179            | 1848.435           | 27.144         |
+| 100                | 8          | 128     | False   | 0.107                      | 0.098                     | 9.883       | 2098.521            | 1848.435           | 13.530         |
+| 100                | 8          | 256     | False   | 0.222                      | 0.196                     | 13.413      | 3989.980            | 2986.492           | 33.601         |
+
+On a local benchmark (NVIDIA GeForce RTX 2060-8GB, PyTorch 2.3.1, OS Ubuntu 20.04) with `float16` and `microsoft/biogpt` model with a simple AutoModel head,
+we saw the following speedups during inference.
+
+| num_batches | batch_size | seq_len | is cuda | is half | use mask | Per token latency eager (ms) | Per token latency SDPA (ms) | Speedup (%) | Mem eager (MB) | Mem BT (MB) | Mem saved (%) |
+|-------------|------------|---------|---------|---------|----------|------------------------------|-----------------------------|-------------|----------------|--------------|---------------|
+| 50          | 1          | 64      | True    | True    | True     | 0.115                        | 0.098                       | 17.392      | 716.998        | 716.998      | 0.000         |
+| 50          | 1          | 128     | True    | True    | True     | 0.115                        | 0.093                       | 24.640      | 730.916        | 730.916      | 0.000         |
+| 50          | 2          | 64      | True    | True    | True     | 0.114                        | 0.096                       | 19.204      | 730.900        | 730.900      | 0.000         |
+| 50          | 2          | 128     | True    | True    | True     | 0.117                        | 0.095                       | 23.529      | 759.262        | 759.262      | 0.000         |
+| 50          | 4          | 64      | True    | True    | True     | 0.113                        | 0.096                       | 18.325      | 759.229        | 759.229      | 0.000         |
+| 50          | 4          | 128     | True    | True    | True     | 0.186                        | 0.178                       | 4.289       | 816.478        | 816.478      | 0.000         |
+
+
 ## Resources
 
 - [Causal language modeling task guide](../tasks/language_modeling)
diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md
index 4c220dd0f148..73ae4d5c0c92 100644
--- a/docs/source/en/perf_infer_gpu_one.md
+++ b/docs/source/en/perf_infer_gpu_one.md
@@ -208,6 +208,7 @@ For now, Transformers supports SDPA inference and training for the following arc
 * [Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer#transformers.ASTModel)
 * [Bart](https://huggingface.co/docs/transformers/model_doc/bart#transformers.BartModel)
 * [Bert](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel)
+* [BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt#transformers.BioGptModel)
 * [CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert#transformers.CamembertModel)
 * [Chameleon](https://huggingface.co/docs/transformers/model_doc/chameleon#transformers.Chameleon)
 * [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPModel)
diff --git a/src/transformers/models/biogpt/modeling_biogpt.py b/src/transformers/models/biogpt/modeling_biogpt.py
index 020f52833d5b..16f7aab5c3df 100755
--- a/src/transformers/models/biogpt/modeling_biogpt.py
+++ b/src/transformers/models/biogpt/modeling_biogpt.py
@@ -23,7 +23,7 @@
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
+from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     CausalLMOutputWithCrossAttentions,
@@ -244,16 +244,130 @@ def forward(
         return attn_output, attn_weights_reshaped, past_key_value
 
 
+# Copied from transformers.models.bart.modeling_bart.BartSdpaAttention with Bart->BioGpt
+class BioGptSdpaAttention(BioGptAttention):
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+        if output_attentions or layer_head_mask is not None:
+            # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once this is implemented.
+            logger.warning_once(
+                "BioGptModel is using BioGptSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual attention"
+                ' implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
+            return super().forward(
+                hidden_states,
+                key_value_states=key_value_states,
+                past_key_value=past_key_value,
+                attention_mask=attention_mask,
+                layer_head_mask=layer_head_mask,
+                output_attentions=output_attentions,
+            )
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+
+        bsz, tgt_len, _ = hidden_states.size()
+
+        # get query proj
+        query_states = self.q_proj(hidden_states)
+        # get key, value proj
+        # `past_key_value[0].shape[2] == key_value_states.shape[1]`
+        # is checking that the `sequence_length` of the `past_key_value` is the same as
+        # the provided `key_value_states` to support prefix tuning
+        if (
+            is_cross_attention
+            and past_key_value is not None
+            and past_key_value[0].shape[2] == key_value_states.shape[1]
+        ):
+            # reuse k,v, cross_attentions
+            key_states = past_key_value[0]
+            value_states = past_key_value[1]
+        elif is_cross_attention:
+            # cross_attentions
+            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+        elif past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+        else:
+            # self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_states, value_states)
+
+        query_states = self._shape(query_states, tgt_len, bsz)
+
+        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+        # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1.
+        is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False
+
+        # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask,
+        # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=attention_mask,
+            dropout_p=self.dropout if self.training else 0.0,
+            is_causal=is_causal,
+        )
+
+        if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.transpose(1, 2)
+
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, None, past_key_value
+
+
+BIOGPT_ATTENTION_CLASSES = {
+    "eager": BioGptAttention,
+    "sdpa": BioGptSdpaAttention,
+}
+
+
 class BioGptDecoderLayer(nn.Module):
     def __init__(self, config: BioGptConfig):
         super().__init__()
         self.embed_dim = config.hidden_size
 
-        self.self_attn = BioGptAttention(
+        self.self_attn = BIOGPT_ATTENTION_CLASSES[config._attn_implementation](
             embed_dim=self.embed_dim,
             num_heads=config.num_attention_heads,
             dropout=config.attention_probs_dropout_prob,
             is_decoder=True,
+            is_causal=True,
         )
         self.dropout = config.hidden_dropout_prob
         self.activation_fn = ACT2FN[config.hidden_act]
@@ -337,6 +451,7 @@ class BioGptPreTrainedModel(PreTrainedModel):
     config_class = BioGptConfig
     base_model_prefix = "biogpt"
     supports_gradient_checkpointing = True
+    _supports_sdpa = True
 
     def _init_weights(self, module):
         """Initialize the weights"""
@@ -444,6 +559,7 @@ def __init__(self, config: BioGptConfig):
         self.layer_norm = nn.LayerNorm(self.embed_dim)
 
         self.gradient_checkpointing = False
+        self._use_sdpa = config._attn_implementation == "sdpa"
         # Initialize weights and apply final processing
         self.post_init()
 
@@ -511,9 +627,16 @@ def forward(
         # embed positions
         positions = self.embed_positions(attention_mask, past_key_values_length)
 
-        attention_mask = _prepare_4d_causal_attention_mask(
-            attention_mask, input_shape, inputs_embeds, past_key_values_length
-        )
+        if self._use_sdpa and not output_attentions and head_mask is None:
+            # output_attentions=True & head_mask can not be supported when using SDPA, fall back to
+            # the manual implementation that requires a 4D causal mask in all cases.
+            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                attention_mask, input_shape, inputs_embeds, past_key_values_length
+            )
+        else:
+            attention_mask = _prepare_4d_causal_attention_mask(
+                attention_mask, input_shape, inputs_embeds, past_key_values_length
+            )
 
         hidden_states = inputs_embeds + positions