From 2f4163afdf72f2e57e36c28c6c1f3d77056c86d0 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Fri, 16 Aug 2024 15:38:35 +0800 Subject: [PATCH 01/25] uniformize kwargs of Chameleon --- .../models/chameleon/processing_chameleon.py | 81 ++++++++++--------- 1 file changed, 41 insertions(+), 40 deletions(-) diff --git a/src/transformers/models/chameleon/processing_chameleon.py b/src/transformers/models/chameleon/processing_chameleon.py index 1480808336d14e..2cac2d4bcb986a 100644 --- a/src/transformers/models/chameleon/processing_chameleon.py +++ b/src/transformers/models/chameleon/processing_chameleon.py @@ -16,13 +16,36 @@ Processor class for Chameleon. """ +import sys from typing import List, Optional, Union from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy -from ...utils import TensorType +from ...processing_utils import ProcessingKwargs, ProcessorMixin, TextKwargs +from ...tokenization_utils_base import PreTokenizedInput, TextInput + +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + + +class ChameleonTextKwargs(TextKwargs, total=False): + return_for_text_completion: bool + + +class ChameleonProcessorKwargs(ProcessingKwargs, total=False): + text_kwargs: ChameleonTextKwargs + _defaults = { + "text_kwargs": { + "padding": False, + "stride": 0, + "return_for_text_completion": False, + }, + "common_kwargs": { + "return_tensors": "pt", + }, + } class ChameleonProcessor(ProcessorMixin): @@ -57,13 +80,9 @@ def __init__(self, image_processor, tokenizer, image_seq_length: int = 1024, ima def __call__( self, - text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, - images: ImageInput = None, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = None, - max_length: int = None, - return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH, - return_for_text_completion: bool = False, + text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + images: Optional[ImageInput] = None, + **kwargs: Unpack[ChameleonProcessorKwargs], ) -> BatchFeature: """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` @@ -80,26 +99,6 @@ def __call__( images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. Both channels-first and channels-last formats are supported. - padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): - Select a strategy to pad the returned sequences (according to the model's padding side and padding - index) among: - - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single - sequence if provided). - - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum - acceptable input length for the model if that argument is not provided. - - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different - lengths). - max_length (`int`, *optional*): - Maximum length of the returned list and optionally padding length (see above). 
- truncation (`bool`, *optional*): - Activates truncation to cut input sequences longer than `max_length` to `max_length`. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - - `'tf'`: Return TensorFlow `tf.constant` objects. - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - - `'jax'`: Return JAX `jnp.ndarray` objects. Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: @@ -114,6 +113,15 @@ def __call__( text = [text] elif not isinstance(text, list) and not isinstance(text[0], str): raise TypeError("Invalid input text. Please provide a string, or a list of strings") + if text is None and images is None: + raise ValueError("You must provide either text or images") + + output_kwargs = self._merge_kwargs( + ChameleonProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + return_for_text_completion = output_kwargs["text_kwargs"].pop("return_for_text_completion", False) # Replace the image token with the expanded image token sequence prompt_strings = [] @@ -124,19 +132,12 @@ def __call__( sample += self.tokenizer.sep_token # special Chameleon treatment to add sep for chat mode prompt_strings.append(sample) - data = self.tokenizer( - prompt_strings, - return_tensors=return_tensors, - padding=padding, - truncation=truncation, - max_length=max_length, - ) + data = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) if images is not None: - pixel_values = self.image_processor(images, return_tensors=return_tensors)["pixel_values"] - data["pixel_values"] = pixel_values + data["pixel_values"] = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"] - return BatchFeature(data=data, tensor_type=return_tensors) + return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"]["return_tensors"]) # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama def batch_decode(self, *args, **kwargs): From 258814458d5fdbf3282a86b46c4a794ca9ccb8b4 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Fri, 16 Aug 2024 15:41:52 +0800 Subject: [PATCH 02/25] fix linter nit --- src/transformers/models/chameleon/processing_chameleon.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/chameleon/processing_chameleon.py b/src/transformers/models/chameleon/processing_chameleon.py index 2cac2d4bcb986a..a039101f56d71e 100644 --- a/src/transformers/models/chameleon/processing_chameleon.py +++ b/src/transformers/models/chameleon/processing_chameleon.py @@ -24,6 +24,7 @@ from ...processing_utils import ProcessingKwargs, ProcessorMixin, TextKwargs from ...tokenization_utils_base import PreTokenizedInput, TextInput + if sys.version_info >= (3, 11): from typing import Unpack else: From 6454130ad1b278beaa682c5311e30020c013013c Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Fri, 16 Aug 2024 17:38:16 +0800 Subject: [PATCH 03/25] rm stride default --- src/transformers/models/chameleon/processing_chameleon.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/chameleon/processing_chameleon.py b/src/transformers/models/chameleon/processing_chameleon.py index a039101f56d71e..d999267ef1fa9c 100644 --- a/src/transformers/models/chameleon/processing_chameleon.py +++ b/src/transformers/models/chameleon/processing_chameleon.py @@ -40,7 +40,6 @@ class ChameleonProcessorKwargs(ProcessingKwargs, total=False): 
_defaults = { "text_kwargs": { "padding": False, - "stride": 0, "return_for_text_completion": False, }, "common_kwargs": { From 9949e722b40261e7d088070e2dec7034220807be Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Fri, 16 Aug 2024 17:43:30 +0800 Subject: [PATCH 04/25] add tests for chameleon processor --- .../models/chameleon/test_processor_chameleon.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 tests/models/chameleon/test_processor_chameleon.py diff --git a/tests/models/chameleon/test_processor_chameleon.py b/tests/models/chameleon/test_processor_chameleon.py new file mode 100644 index 00000000000000..74314e3d4c1e95 --- /dev/null +++ b/tests/models/chameleon/test_processor_chameleon.py @@ -0,0 +1,16 @@ +import tempfile +import unittest + +from transformers import ChameleonProcessor + +from ...test_processing_common import ProcessorTesterMixin + + +class ChameleonProcessorTest(ProcessorTesterMixin, unittest.TestCase): + from_pretrained_id = "leloy/Anole-7b-v0.1-hf" + processor_class = ChameleonProcessor + + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + processor = self.processor_class.from_pretrained(self.from_pretrained_id) + processor.save_pretrained(self.tmpdirname) From 58c6b53661a1d6111710d37f02ea7a489410e0f1 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Fri, 16 Aug 2024 18:07:33 +0800 Subject: [PATCH 05/25] fix tests --- .../chameleon/test_processor_chameleon.py | 17 +++++++++++++++++ tests/test_processing_common.py | 7 +++++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/tests/models/chameleon/test_processor_chameleon.py b/tests/models/chameleon/test_processor_chameleon.py index 74314e3d4c1e95..1efeaa5339d304 100644 --- a/tests/models/chameleon/test_processor_chameleon.py +++ b/tests/models/chameleon/test_processor_chameleon.py @@ -2,6 +2,7 @@ import unittest from transformers import ChameleonProcessor +from transformers.models.auto.processing_auto import processor_class_from_name from ...test_processing_common import ProcessorTesterMixin @@ -10,6 +11,22 @@ class ChameleonProcessorTest(ProcessorTesterMixin, unittest.TestCase): from_pretrained_id = "leloy/Anole-7b-v0.1-hf" processor_class = ChameleonProcessor + def get_component(self, attribute, **kwargs): + assert attribute in self.processor_class.attributes + component_class_name = getattr(self.processor_class, f"{attribute}_class") + if isinstance(component_class_name, tuple): + if "_fast" in component_class_name[0]: + component_class_name = component_class_name[0] + else: + component_class_name = component_class_name[1] + + component_class = processor_class_from_name(component_class_name) + component = component_class.from_pretrained(self.tmpdirname, **kwargs) # noqa + if attribute == "tokenizer" and not component.pad_token: + component.pad_token = "[TEST_PAD]" + + return component + def setUp(self): self.tmpdirname = tempfile.mkdtemp() processor = self.processor_class.from_pretrained(self.from_pretrained_id) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index a30c6363b9d7ff..31f14e7294380d 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -143,7 +143,7 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): def test_image_processor_defaults_preserved_by_image_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", 
size=(234, 234)) + image_processor = self.get_component("image_processor", size=(234, 234), crop_size=(234, 234)) tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) @@ -187,7 +187,7 @@ def test_kwargs_overrides_default_image_processor_kwargs(self): input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input, size=[224, 224]) + inputs = processor(text=input_str, images=image_input, size=[224, 224], crop_size=(224, 224)) self.assertEqual(len(inputs["pixel_values"][0][0]), 224) @require_torch @@ -208,6 +208,7 @@ def test_unstructured_kwargs(self): images=image_input, return_tensors="pt", size={"height": 214, "width": 214}, + crop_size={"height": 214, "width": 214}, padding="max_length", max_length=76, ) @@ -233,6 +234,7 @@ def test_unstructured_kwargs_batched(self): images=image_input, return_tensors="pt", size={"height": 214, "width": 214}, + crop_size={"height": 214, "width": 214}, padding="longest", max_length=76, ) @@ -260,6 +262,7 @@ def test_doubly_passed_kwargs(self): images=image_input, images_kwargs={"size": {"height": 222, "width": 222}}, size={"height": 214, "width": 214}, + crop_size={"height": 214, "width": 214}, ) @require_torch From 6592ce3d9e6de63a6ab8027f158701b437c2bda4 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Fri, 16 Aug 2024 18:34:41 +0800 Subject: [PATCH 06/25] fix chameleon tests --- tests/test_processing_common.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index 31f14e7294380d..6a0bdd5e86349a 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -282,7 +282,10 @@ def test_structured_kwargs_nested(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"size": {"height": 214, "width": 214}}, + "images_kwargs": { + "size": {"height": 214, "width": 214}, + "crop_size": {"height": 214, "width": 214}, + }, "text_kwargs": {"padding": "max_length", "max_length": 76}, } @@ -310,7 +313,10 @@ def test_structured_kwargs_nested_from_dict(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"size": {"height": 214, "width": 214}}, + "images_kwargs": { + "size": {"height": 214, "width": 214}, + "crop_size": {"height": 214, "width": 214}, + }, "text_kwargs": {"padding": "max_length", "max_length": 76}, } From c4f5474b25867f486ecd8224dd4efa41aea54bdd Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Fri, 16 Aug 2024 21:59:58 +0800 Subject: [PATCH 07/25] don't hardcode arg names --- tests/test_processing_common.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index 6a0bdd5e86349a..ec1e211872e667 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -49,6 +49,8 @@ @require_torch class ProcessorTesterMixin: processor_class = None + text_data_arg_name = "input_ids" + images_data_arg_name = "pixel_values" def prepare_processor_dict(self): return {} @@ -136,7 +138,7 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input, return_tensors="pt") - 
self.assertEqual(len(inputs["input_ids"][0]), 117) + self.assertEqual(len(inputs[self.text_data_arg_name][0]), 117) @require_torch @require_vision @@ -153,7 +155,7 @@ def test_image_processor_defaults_preserved_by_image_kwargs(self): image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input) - self.assertEqual(len(inputs["pixel_values"][0][0]), 234) + self.assertEqual(len(inputs[self.images_data_arg_name][0][0]), 234) @require_vision @require_torch @@ -171,7 +173,7 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): inputs = processor( text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length" ) - self.assertEqual(len(inputs["input_ids"][0]), 112) + self.assertEqual(len(inputs[self.text_data_arg_name][0]), 112) @require_torch @require_vision @@ -188,7 +190,7 @@ def test_kwargs_overrides_default_image_processor_kwargs(self): image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input, size=[224, 224], crop_size=(224, 224)) - self.assertEqual(len(inputs["pixel_values"][0][0]), 224) + self.assertEqual(len(inputs[self.images_data_arg_name][0][0]), 224) @require_torch @require_vision @@ -213,8 +215,8 @@ def test_unstructured_kwargs(self): max_length=76, ) - self.assertEqual(inputs["pixel_values"].shape[2], 214) - self.assertEqual(len(inputs["input_ids"][0]), 76) + self.assertEqual(inputs[self.images_data_arg_name].shape[2], 214) + self.assertEqual(len(inputs[self.text_data_arg_name][0]), 76) @require_torch @require_vision @@ -239,9 +241,9 @@ def test_unstructured_kwargs_batched(self): max_length=76, ) - self.assertEqual(inputs["pixel_values"].shape[2], 214) + self.assertEqual(inputs[self.images_data_arg_name].shape[2], 214) - self.assertEqual(len(inputs["input_ids"][0]), 6) + self.assertEqual(len(inputs[self.text_data_arg_name][0]), 6) @require_torch @require_vision @@ -292,9 +294,9 @@ def test_structured_kwargs_nested(self): inputs = processor(text=input_str, images=image_input, **all_kwargs) self.skip_processor_without_typed_kwargs(processor) - self.assertEqual(inputs["pixel_values"].shape[2], 214) + self.assertEqual(inputs[self.images_data_arg_name].shape[2], 214) - self.assertEqual(len(inputs["input_ids"][0]), 76) + self.assertEqual(len(inputs[self.text_data_arg_name][0]), 76) @require_torch @require_vision @@ -321,9 +323,9 @@ def test_structured_kwargs_nested_from_dict(self): } inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.assertEqual(inputs["pixel_values"].shape[2], 214) + self.assertEqual(inputs[self.images_data_arg_name].shape[2], 214) - self.assertEqual(len(inputs["input_ids"][0]), 76) + self.assertEqual(len(inputs[self.text_data_arg_name][0]), 76) class MyProcessor(ProcessorMixin): From ce9cc731d46294fa5c095f713ce3b6d9e8c91b4f Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Sat, 17 Aug 2024 14:16:08 +0800 Subject: [PATCH 08/25] uniformize processor kwargs of altclip, bridgetower, flava, instructblipvideo, llava_next, llava_next_video, siglip, video_llava, vilt --- .../models/altclip/processing_altclip.py | 38 +++- .../image_processing_bridgetower.py | 4 +- .../models/flava/processing_flava.py | 89 +++++---- .../processing_instructblipvideo.py | 102 +++++----- .../llava_next/image_processing_llava_next.py | 6 +- .../processing_llava_next_video.py | 73 ++++--- .../models/siglip/processing_siglip.py | 68 +++---- .../video_llava/processing_video_llava.py | 74 ++++---- .../models/vilt/image_processing_vilt.py | 8 +- 
.../models/vilt/processing_vilt.py | 71 ++++--- .../models/altclip/test_processor_altclip.py | 16 ++ tests/models/flava/test_processor_flava.py | 10 +- .../test_processor_instructblipvideo.py | 21 +++ .../test_processor_llava_next_video.py | 16 ++ tests/models/siglip/test_processor_siglip.py | 16 ++ .../video_llava/test_processor_video_llava.py | 17 ++ tests/models/vilt/test_processor_vilt.py | 178 ++++++++++++++++++ tests/test_processing_common.py | 122 +++++++----- 18 files changed, 614 insertions(+), 315 deletions(-) create mode 100644 tests/models/altclip/test_processor_altclip.py create mode 100644 tests/models/instructblipvideo/test_processor_instructblipvideo.py create mode 100644 tests/models/llava_next_video/test_processor_llava_next_video.py create mode 100644 tests/models/siglip/test_processor_siglip.py create mode 100644 tests/models/video_llava/test_processor_video_llava.py create mode 100644 tests/models/vilt/test_processor_vilt.py diff --git a/src/transformers/models/altclip/processing_altclip.py b/src/transformers/models/altclip/processing_altclip.py index 2814b2d7f26e89..af9b0aee5930a6 100644 --- a/src/transformers/models/altclip/processing_altclip.py +++ b/src/transformers/models/altclip/processing_altclip.py @@ -16,10 +16,23 @@ Image/Text processor class for AltCLIP """ +import sys import warnings +from typing import List, Optional, Union -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import BatchEncoding +from ...image_utils import ImageInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput + + +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + + +class AltCLIPProcessingKwargs(ProcessingKwargs, total=False): + _defaults = {} class AltCLIPProcessor(ProcessorMixin): @@ -59,7 +72,12 @@ def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) - def __call__(self, text=None, images=None, return_tensors=None, **kwargs): + def __call__( + self, + text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + images: Optional[ImageInput] = None, + **kwargs: Unpack[AltCLIPProcessingKwargs], + ): """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` and `kwargs` arguments to XLMRobertaTokenizerFast's [`~XLMRobertaTokenizerFast.__call__`] if `text` is not @@ -97,11 +115,17 @@ def __call__(self, text=None, images=None, return_tensors=None, **kwargs): if text is None and images is None: raise ValueError("You have to specify either text or images. 
Both cannot be none.") + output_kwargs = self._merge_kwargs( + AltCLIPProcessingKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + if text is not None: - encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs) + encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) if images is not None: - image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs) + image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) if text is not None and images is not None: encoding["pixel_values"] = image_features.pixel_values @@ -109,7 +133,9 @@ def __call__(self, text=None, images=None, return_tensors=None, **kwargs): elif text is not None: return encoding else: - return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) + return BatchEncoding( + data=dict(**image_features), tensor_type=output_kwargs["common_kwargs"]["return_tensors"] + ) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower.py b/src/transformers/models/bridgetower/image_processing_bridgetower.py index 7272093715f882..49905d36b33518 100644 --- a/src/transformers/models/bridgetower/image_processing_bridgetower.py +++ b/src/transformers/models/bridgetower/image_processing_bridgetower.py @@ -115,8 +115,8 @@ def get_resize_output_image_size( new_width = scale * new_width new_height, new_width = int(new_height + 0.5), int(new_width + 0.5) - new_height = new_height // size_divisor * size_divisor - new_width = new_width // size_divisor * size_divisor + new_height = max(1, new_height // size_divisor) * size_divisor + new_width = max(1, new_width // size_divisor) * size_divisor return new_height, new_width diff --git a/src/transformers/models/flava/processing_flava.py b/src/transformers/models/flava/processing_flava.py index 7f439b040a8fd0..e98df78f6034d5 100644 --- a/src/transformers/models/flava/processing_flava.py +++ b/src/transformers/models/flava/processing_flava.py @@ -16,13 +16,41 @@ Image/Text processor class for FLAVA """ +import sys import warnings from typing import List, Optional, Union from ...image_utils import ImageInput -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy -from ...utils import TensorType +from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput + + +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + + +class FlavaImagesKwargs(ImagesKwargs, total=False): + return_image_mask: Optional[bool] + return_codebook_pixels: Optional[bool] + + +class FlavaProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: FlavaImagesKwargs + _defaults = { + "text_kwargs": { + "add_special_tokens": True, + "padding": False, + "truncation": False, + "stride": 0, + "return_overflowing_tokens": False, + "return_special_tokens_mask": False, + "return_offsets_mapping": False, + "return_length": False, + "verbose": True, + }, + } class FlavaProcessor(ProcessorMixin): @@ -64,23 +92,7 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, - add_special_tokens: bool = True, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, 
TruncationStrategy] = False, - max_length: Optional[int] = None, - stride: int = 0, - pad_to_multiple_of: Optional[int] = None, - return_image_mask: Optional[bool] = None, - return_codebook_pixels: Optional[bool] = None, - return_token_type_ids: Optional[bool] = None, - return_attention_mask: Optional[bool] = None, - return_overflowing_tokens: bool = False, - return_special_tokens_mask: bool = False, - return_offsets_mapping: bool = False, - return_length: bool = False, - verbose: bool = True, - return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs, + **kwargs: Unpack[FlavaProcessorKwargs], ): """ This method uses [`FlavaImageProcessor.__call__`] method to prepare image(s) for the model, and @@ -92,33 +104,16 @@ def __call__( if text is None and images is None: raise ValueError("You have to specify either text or images. Both cannot be none.") + output_kwargs = self._merge_kwargs( + FlavaProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + if text is not None: - encoding = self.tokenizer( - text=text, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - stride=stride, - pad_to_multiple_of=pad_to_multiple_of, - return_token_type_ids=return_token_type_ids, - return_attention_mask=return_attention_mask, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_offsets_mapping=return_offsets_mapping, - return_length=return_length, - verbose=verbose, - return_tensors=return_tensors, - **kwargs, - ) + encoding = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) if images is not None: - image_features = self.image_processor( - images, - return_image_mask=return_image_mask, - return_codebook_pixels=return_codebook_pixels, - return_tensors=return_tensors, - **kwargs, - ) + image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) if text is not None and images is not None: encoding.update(image_features) @@ -126,7 +121,9 @@ def __call__( elif text is not None: return encoding else: - return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) + return BatchEncoding( + data=dict(**image_features), tensor_type=output_kwargs["common_kwargs"]["return_tensors"] + ) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py index f56f8186b07d73..c38edc2048549f 100644 --- a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py @@ -17,26 +17,48 @@ """ import os +import sys from typing import List, Optional, Union from ...image_processing_utils import BatchFeature from ...image_utils import VideoInput -from ...processing_utils import ProcessorMixin +from ...processing_utils import ProcessingKwargs, ProcessorMixin from ...tokenization_utils_base import ( AddedToken, BatchEncoding, - PaddingStrategy, PreTokenizedInput, TextInput, - TruncationStrategy, ) -from ...utils import TensorType, logging +from ...utils import logging from ..auto import AutoTokenizer +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + + logger = logging.get_logger(__name__) +class InstructBlipVideoProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "add_special_tokens": True, + "padding": 
False, + "truncation": None, + "stride": 0, + "return_overflowing_tokens": False, + "return_special_tokens_mask": False, + "return_offsets_mapping": False, + "return_token_type_ids": False, + "return_length": False, + "verbose": True, + }, + } + + class InstructBlipVideoProcessor(ProcessorMixin): r""" Constructs an InstructBLIPVideo processor which wraps a InstructBLIP image processor and a LLaMa/T5 tokenizer into a single @@ -71,23 +93,11 @@ def __init__(self, image_processor, tokenizer, qformer_tokenizer=None, num_query def __call__( self, - images: VideoInput = None, - text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, - add_special_tokens: bool = True, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = None, - max_length: Optional[int] = None, - stride: int = 0, - pad_to_multiple_of: Optional[int] = None, - return_attention_mask: Optional[bool] = None, - return_overflowing_tokens: bool = False, - return_special_tokens_mask: bool = False, - return_offsets_mapping: bool = False, - return_token_type_ids: bool = False, - return_length: bool = False, - verbose: bool = True, - return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs, + images: Optional[VideoInput] = None, + text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + audio=None, + videos=None, + **kwargs: Unpack[InstructBlipVideoProcessorKwargs], ) -> BatchFeature: """ This method uses [`InstructBlipVideoImageProcessor.__call__`] method to prepare image(s) or video(s) for the model, and @@ -95,6 +105,12 @@ def __call__( Please refer to the docstring of the above two methods for more information. """ + output_kwargs = self._merge_kwargs( + InstructBlipVideoProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + encoding = BatchFeature() if text is not None: @@ -105,21 +121,10 @@ def __call__( _text_encoding = self.tokenizer( text=text, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - stride=stride, - pad_to_multiple_of=pad_to_multiple_of, - return_attention_mask=return_attention_mask, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_offsets_mapping=return_offsets_mapping, - return_token_type_ids=return_token_type_ids, - return_length=return_length, - verbose=verbose, - return_tensors=None, # required to concatenate below - **kwargs, + **{ + **output_kwargs["text_kwargs"], + "return_tensors": None, # required to concatenate below + }, ) # if we know how many query tokens, expand text inside processor. 
We need this hacky manipulation @@ -145,31 +150,16 @@ def __call__( ) # cast to desired return tensors type after concatenating - text_encoding = BatchEncoding(text_encoding, tensor_type=return_tensors) - encoding.update(text_encoding) - qformer_text_encoding = self.qformer_tokenizer( - text=text, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - stride=stride, - pad_to_multiple_of=pad_to_multiple_of, - return_attention_mask=return_attention_mask, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_offsets_mapping=return_offsets_mapping, - return_token_type_ids=return_token_type_ids, - return_length=return_length, - verbose=verbose, - return_tensors=return_tensors, - **kwargs, + text_encoding = BatchEncoding( + text_encoding, tensor_type=output_kwargs["common_kwargs"].get("return_tensors") ) + encoding.update(text_encoding) + qformer_text_encoding = self.qformer_tokenizer(text=text, **output_kwargs["text_kwargs"]) encoding["qformer_input_ids"] = qformer_text_encoding.pop("input_ids") encoding["qformer_attention_mask"] = qformer_text_encoding.pop("attention_mask") if images is not None: - image_encoding = self.image_processor(images, return_tensors=return_tensors) + image_encoding = self.image_processor(images, **output_kwargs["images_kwargs"]) encoding.update(image_encoding) return encoding diff --git a/src/transformers/models/llava_next/image_processing_llava_next.py b/src/transformers/models/llava_next/image_processing_llava_next.py index f744b9fcf9c1cd..f8237d0078bf0a 100644 --- a/src/transformers/models/llava_next/image_processing_llava_next.py +++ b/src/transformers/models/llava_next/image_processing_llava_next.py @@ -720,7 +720,11 @@ def preprocess( image_patches = self.get_image_patches( image, image_grid_pinpoints, - size=(size["shortest_edge"], size["shortest_edge"]), + size=( + (size["shortest_edge"], size["shortest_edge"]) + if "shortest_edge" in size + else (size["height"], size["width"]) + ), patch_size=crop_size["height"], resample=resample, data_format=input_data_format, diff --git a/src/transformers/models/llava_next_video/processing_llava_next_video.py b/src/transformers/models/llava_next_video/processing_llava_next_video.py index efbb193ba62a9f..e693ce265ef1e6 100644 --- a/src/transformers/models/llava_next_video/processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/processing_llava_next_video.py @@ -16,13 +16,20 @@ Processor class for LLaVa-NeXT-Video. 
""" +import sys from typing import TYPE_CHECKING, List, Optional, Union from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, VideoInput, get_image_size, to_numpy_array -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy -from ...utils import TensorType, logging +from ...processing_utils import ProcessingKwargs, ProcessorMixin +from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils import logging + + +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack if TYPE_CHECKING: @@ -31,6 +38,17 @@ logger = logging.get_logger(__name__) +class LlavaNextVideoProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "padding": False, + }, + "common_kwargs": { + "return_tensors": "pt", + }, + } + + class LlavaNextVideoProcessor(ProcessorMixin): r""" Constructs a LLaVa-NeXT-Video processor which wraps a LLaVa-NeXT image processor, LLaVa-NeXT-Video video processor and @@ -88,12 +106,10 @@ def __init__( def __call__( self, text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]], - images: ImageInput = None, - videos: VideoInput = None, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = None, - max_length: int = None, - return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH, + images: Optional[ImageInput] = None, + videos: Optional[VideoInput] = None, + audio=None, + **kwargs: Unpack[LlavaNextVideoProcessorKwargs], ) -> BatchFeature: """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` @@ -115,26 +131,6 @@ def __call__( videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`): The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported. - padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): - Select a strategy to pad the returned sequences (according to the model's padding side and padding - index) among: - - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single - sequence if provided). - - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum - acceptable input length for the model if that argument is not provided. - - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different - lengths). - max_length (`int`, *optional*): - Maximum length of the returned list and optionally padding length (see above). - truncation (`bool`, *optional*): - Activates truncation to cut input sequences longer than `max_length` to `max_length`. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - - `'tf'`: Return TensorFlow `tf.constant` objects. - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - - `'jax'`: Return JAX `jnp.ndarray` objects. Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: @@ -145,13 +141,19 @@ def __call__( `None`). - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. 
""" + output_kwargs = self._merge_kwargs( + LlavaNextVideoProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + if images is not None: - image_inputs = self.image_processor(images, return_tensors=return_tensors) + image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) else: image_inputs = {} if videos is not None: - videos_inputs = self.video_processor(videos, return_tensors=return_tensors) + videos_inputs = self.video_processor(videos, **output_kwargs["videos_kwargs"]) else: videos_inputs = {} @@ -203,14 +205,7 @@ def __call__( sample = sample.replace(self.video_token, self.video_token * num_video_tokens) prompt_strings.append(sample) - text_inputs = self.tokenizer( - prompt_strings, - return_tensors=return_tensors, - padding=padding, - truncation=truncation, - max_length=max_length, - ) - print(text_inputs.keys()) + text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}) diff --git a/src/transformers/models/siglip/processing_siglip.py b/src/transformers/models/siglip/processing_siglip.py index 655fb4d4f78ab0..eef71e33424d8e 100644 --- a/src/transformers/models/siglip/processing_siglip.py +++ b/src/transformers/models/siglip/processing_siglip.py @@ -16,13 +16,30 @@ Image/Text processor class for SigLIP. """ +import sys from typing import List, Optional, Union from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy -from ...utils import TensorType +from ...processing_utils import ProcessingKwargs, ProcessorMixin +from ...tokenization_utils_base import PreTokenizedInput, TextInput + + +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + + +class SiglipProcessingKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "padding": False, + }, + "common_kwargs": { + "return_tensors": "pt", + }, + } class SiglipProcessor(ProcessorMixin): @@ -48,12 +65,9 @@ def __init__(self, image_processor, tokenizer): def __call__( self, - text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, - images: ImageInput = None, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = None, - max_length: int = None, - return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH, + text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + images: Optional[ImageInput] = None, + **kwargs: Unpack[SiglipProcessingKwargs], ) -> BatchFeature: """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` @@ -70,26 +84,6 @@ def __call__( images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. Both channels-first and channels-last formats are supported. 
- padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): - Select a strategy to pad the returned sequences (according to the model's padding side and padding - index) among: - - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single - sequence if provided). - - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum - acceptable input length for the model if that argument is not provided. - - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different - lengths). - max_length (`int`, *optional*): - Maximum length of the returned list and optionally padding length (see above). - truncation (`bool`, *optional*): - Activates truncation to cut input sequences longer than `max_length` to `max_length`. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - - `'tf'`: Return TensorFlow `tf.constant` objects. - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - - `'jax'`: Return JAX `jnp.ndarray` objects. Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: @@ -104,13 +98,17 @@ def __call__( if text is None and images is None: raise ValueError("You have to specify either text or images. Both cannot be none.") + output_kwargs = self._merge_kwargs( + SiglipProcessingKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + if text is not None: - encoding = self.tokenizer( - text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length - ) + encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) if images is not None: - image_features = self.image_processor(images, return_tensors=return_tensors) + image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) if text is not None and images is not None: encoding["pixel_values"] = image_features.pixel_values @@ -118,7 +116,9 @@ def __call__( elif text is not None: return encoding else: - return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) + return BatchFeature( + data=dict(**image_features), tensor_type=output_kwargs["common_kwargs"]["return_tensors"] + ) def decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/video_llava/processing_video_llava.py b/src/transformers/models/video_llava/processing_video_llava.py index a06913d7acf760..774c4003f3cb0f 100644 --- a/src/transformers/models/video_llava/processing_video_llava.py +++ b/src/transformers/models/video_llava/processing_video_llava.py @@ -16,18 +16,36 @@ Processor class for VideoLlava. 
""" +import sys from typing import List, Optional, Union from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, get_image_size, to_numpy_array -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy -from ...utils import TensorType, logging +from ...processing_utils import ProcessingKwargs, ProcessorMixin +from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils import logging + + +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack logger = logging.get_logger(__name__) +class VideoLlavaProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "padding": False, + }, + "common_kwargs": { + "return_tensors": "pt", + }, + } + + class VideoLlavaProcessor(ProcessorMixin): r""" Constructs a VideoLlava processor which wraps a VideoLlava image processor and a Llava tokenizer into a single processor. @@ -77,13 +95,11 @@ def __init__( def __call__( self, - text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, - images: ImageInput = None, - videos: ImageInput = None, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = None, - max_length=None, - return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH, + text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + images: Optional[ImageInput] = None, + videos: Optional[ImageInput] = None, + audio=None, + **kwargs: Unpack[VideoLlavaProcessorKwargs], ) -> BatchFeature: """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` @@ -105,26 +121,6 @@ def __call__( Video frames to preprocess. Expects a single or batch of video frames in NumPy array or PyTorch tensor. Each video should be of shape (T, C, H, W), where T is number of frames, C is number of channels, H and W are image height and width. - padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): - Select a strategy to pad the returned sequences (according to the model's padding side and padding - index) among: - - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single - sequence if provided). - - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum - acceptable input length for the model if that argument is not provided. - - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different - lengths). - max_length (`int`, *optional*): - Maximum length of the returned list and optionally padding length (see above). - truncation (`bool`, *optional*): - Activates truncation to cut input sequences longer than `max_length` to `max_length`. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - - `'tf'`: Return TensorFlow `tf.constant` objects. - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - - `'jax'`: Return JAX `jnp.ndarray` objects. Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: @@ -135,9 +131,17 @@ def __call__( `None`). - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. 
""" + output_kwargs = self._merge_kwargs( + VideoLlavaProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + # Temporary fix for "paddding_side" in init_kwargs + _ = output_kwargs["text_kwargs"].pop("padding_side", None) + data = {} if images is not None or videos is not None: - encoded_images = self.image_processor(images=images, videos=videos, return_tensors=return_tensors) + encoded_images = self.image_processor(images=images, videos=videos, **output_kwargs["images_kwargs"]) data.update(encoded_images) if isinstance(text, str): @@ -174,13 +178,7 @@ def __call__( sample = sample.replace(self.video_token, self.video_token * num_video_tokens) prompt_strings.append(sample) - text_inputs = self.tokenizer( - prompt_strings, - return_tensors=return_tensors, - padding=padding, - truncation=truncation, - max_length=max_length, - ) + text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) data.update(text_inputs) return BatchFeature(data=data) diff --git a/src/transformers/models/vilt/image_processing_vilt.py b/src/transformers/models/vilt/image_processing_vilt.py index 66ffeb816fec5e..f2c3529218e257 100644 --- a/src/transformers/models/vilt/image_processing_vilt.py +++ b/src/transformers/models/vilt/image_processing_vilt.py @@ -112,8 +112,8 @@ def get_resize_output_image_size( new_width = scale * new_width new_height, new_width = int(new_height + 0.5), int(new_width + 0.5) - new_height = new_height // size_divisor * size_divisor - new_width = new_width // size_divisor * size_divisor + new_height = max(1, new_height // size_divisor) * size_divisor + new_width = max(1, new_width // size_divisor) * size_divisor return new_height, new_width @@ -236,9 +236,7 @@ def resize( The channel dimension format of the input image. If not provided, it will be inferred. """ size = get_size_dict(size, default_to_square=False) - if "shortest_edge" not in size: - raise ValueError(f"The `size` dictionary must contain the key `shortest_edge`. Got {size.keys()}") - shorter = size["shortest_edge"] + shorter = size["shortest_edge"] if "shortest_edge" in size else min(size["height"], size["width"]) longer = int(1333 / 800 * shorter) output_size = get_resize_output_image_size( image, shorter=shorter, longer=longer, size_divisor=size_divisor, input_data_format=input_data_format diff --git a/src/transformers/models/vilt/processing_vilt.py b/src/transformers/models/vilt/processing_vilt.py index 0ccb884ea00c9d..46e18b3ff6bd88 100644 --- a/src/transformers/models/vilt/processing_vilt.py +++ b/src/transformers/models/vilt/processing_vilt.py @@ -16,12 +16,34 @@ Processor class for ViLT. 
""" +import sys import warnings from typing import List, Optional, Union -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy -from ...utils import TensorType +from ...image_utils import ImageInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput + + +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + + +class ViltProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "add_special_tokens": True, + "padding": False, + "stride": 0, + "return_overflowing_tokens": False, + "return_special_tokens_mask": False, + "return_offsets_mapping": False, + "return_length": False, + "verbose": True, + }, + } class ViltProcessor(ProcessorMixin): @@ -63,23 +85,9 @@ def __init__(self, image_processor=None, tokenizer=None, **kwargs): def __call__( self, - images, - text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, - add_special_tokens: bool = True, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = None, - max_length: Optional[int] = None, - stride: int = 0, - pad_to_multiple_of: Optional[int] = None, - return_token_type_ids: Optional[bool] = None, - return_attention_mask: Optional[bool] = None, - return_overflowing_tokens: bool = False, - return_special_tokens_mask: bool = False, - return_offsets_mapping: bool = False, - return_length: bool = False, - verbose: bool = True, - return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs, + images: ImageInput, + text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + **kwargs: Unpack[ViltProcessorKwargs], ) -> BatchEncoding: """ This method uses [`ViltImageProcessor.__call__`] method to prepare image(s) for the model, and @@ -87,26 +95,15 @@ def __call__( Please refer to the docstring of the above two methods for more information. 
""" - encoding = self.tokenizer( - text=text, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - stride=stride, - pad_to_multiple_of=pad_to_multiple_of, - return_token_type_ids=return_token_type_ids, - return_attention_mask=return_attention_mask, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_offsets_mapping=return_offsets_mapping, - return_length=return_length, - verbose=verbose, - return_tensors=return_tensors, + output_kwargs = self._merge_kwargs( + ViltProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, ) + + encoding = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) # add pixel_values + pixel_mask - encoding_image_processor = self.image_processor(images, return_tensors=return_tensors) + encoding_image_processor = self.image_processor(images, **output_kwargs["images_kwargs"]) encoding.update(encoding_image_processor) return encoding diff --git a/tests/models/altclip/test_processor_altclip.py b/tests/models/altclip/test_processor_altclip.py new file mode 100644 index 00000000000000..86a84ae9ab8bc6 --- /dev/null +++ b/tests/models/altclip/test_processor_altclip.py @@ -0,0 +1,16 @@ +import tempfile +import unittest + +from transformers import AltCLIPProcessor + +from ...test_processing_common import ProcessorTesterMixin + + +class AltCLIPProcessorTest(ProcessorTesterMixin, unittest.TestCase): + from_pretrained_id = "BAAI/AltCLIP" + processor_class = AltCLIPProcessor + + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + processor = self.processor_class.from_pretrained(self.from_pretrained_id) + processor.save_pretrained(self.tmpdirname) diff --git a/tests/models/flava/test_processor_flava.py b/tests/models/flava/test_processor_flava.py index a83e459153d532..56a52ee21c7b07 100644 --- a/tests/models/flava/test_processor_flava.py +++ b/tests/models/flava/test_processor_flava.py @@ -22,16 +22,18 @@ import numpy as np import pytest -from transformers import BertTokenizer, BertTokenizerFast +from transformers import BertTokenizer, BertTokenizerFast, FlavaProcessor from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES from transformers.testing_utils import require_vision from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available +from ...test_processing_common import ProcessorTesterMixin + if is_vision_available(): from PIL import Image - from transformers import FlavaImageProcessor, FlavaProcessor + from transformers import FlavaImageProcessor from transformers.models.flava.image_processing_flava import ( FLAVA_CODEBOOK_MEAN, FLAVA_CODEBOOK_STD, @@ -41,7 +43,9 @@ @require_vision -class FlavaProcessorTest(unittest.TestCase): +class FlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = FlavaProcessor + def setUp(self): self.tmpdirname = tempfile.mkdtemp() diff --git a/tests/models/instructblipvideo/test_processor_instructblipvideo.py b/tests/models/instructblipvideo/test_processor_instructblipvideo.py new file mode 100644 index 00000000000000..07c81f1f649651 --- /dev/null +++ b/tests/models/instructblipvideo/test_processor_instructblipvideo.py @@ -0,0 +1,21 @@ +import tempfile +import unittest + +from transformers import InstructBlipVideoProcessor + +from ...test_processing_common import ProcessorTesterMixin + + +class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase): + from_pretrained_id = "Salesforce/instructblip-vicuna-7b" + 
processor_class = InstructBlipVideoProcessor + + def prepare_components(self): + components = super().prepare_components() + components["qformer_tokenizer"] = components["tokenizer"] + return components + + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + processor = self.processor_class.from_pretrained(self.from_pretrained_id) + processor.save_pretrained(self.tmpdirname) diff --git a/tests/models/llava_next_video/test_processor_llava_next_video.py b/tests/models/llava_next_video/test_processor_llava_next_video.py new file mode 100644 index 00000000000000..9cd4615d572547 --- /dev/null +++ b/tests/models/llava_next_video/test_processor_llava_next_video.py @@ -0,0 +1,16 @@ +import tempfile +import unittest + +from transformers import LlavaNextVideoProcessor + +from ...test_processing_common import ProcessorTesterMixin + + +class LlavaNextVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase): + from_pretrained_id = "llava-hf/LLaVA-NeXT-Video-7B-hf" + processor_class = LlavaNextVideoProcessor + + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + processor = self.processor_class.from_pretrained(self.from_pretrained_id) + processor.save_pretrained(self.tmpdirname) diff --git a/tests/models/siglip/test_processor_siglip.py b/tests/models/siglip/test_processor_siglip.py new file mode 100644 index 00000000000000..608ff70539a218 --- /dev/null +++ b/tests/models/siglip/test_processor_siglip.py @@ -0,0 +1,16 @@ +import tempfile +import unittest + +from transformers import SiglipProcessor + +from ...test_processing_common import ProcessorTesterMixin + + +class SiglipProcessorTest(ProcessorTesterMixin, unittest.TestCase): + from_pretrained_id = "google/siglip-base-patch16-224" + processor_class = SiglipProcessor + + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + processor = self.processor_class.from_pretrained(self.from_pretrained_id) + processor.save_pretrained(self.tmpdirname) diff --git a/tests/models/video_llava/test_processor_video_llava.py b/tests/models/video_llava/test_processor_video_llava.py new file mode 100644 index 00000000000000..9ddc84a6bcb944 --- /dev/null +++ b/tests/models/video_llava/test_processor_video_llava.py @@ -0,0 +1,17 @@ +import tempfile +import unittest + +from transformers.models.video_llava.processing_video_llava import VideoLlavaProcessor + +from ...test_processing_common import ProcessorTesterMixin + + +class VideoLlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase): + from_pretrained_id = "LanguageBind/Video-LLaVA-7B-hf" + processor_class = VideoLlavaProcessor + images_data_arg_name = "pixel_values_images" + + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + processor = self.processor_class.from_pretrained(self.from_pretrained_id) + processor.save_pretrained(self.tmpdirname) diff --git a/tests/models/vilt/test_processor_vilt.py b/tests/models/vilt/test_processor_vilt.py new file mode 100644 index 00000000000000..0ae6a5256d1b32 --- /dev/null +++ b/tests/models/vilt/test_processor_vilt.py @@ -0,0 +1,178 @@ +import tempfile +import unittest + +from transformers import ViltProcessor +from transformers.testing_utils import require_torch, require_vision + +from ...test_processing_common import ProcessorTesterMixin + + +class ViltProcessorTest(ProcessorTesterMixin, unittest.TestCase): + from_pretrained_id = "dandelin/vilt-b32-mlm" + processor_class = ViltProcessor + + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + processor = self.processor_class.from_pretrained(self.from_pretrained_id) + 
processor.save_pretrained(self.tmpdirname) + + @require_torch + @require_vision + def test_image_processor_defaults_preserved_by_image_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + processor_components = self.prepare_components() + processor_components["image_processor"] = self.get_component( + "image_processor", size=(234, 234), crop_size=(234, 234), size_divisor=32 + ) + processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") + + processor = self.processor_class(**processor_components) + self.skip_processor_without_typed_kwargs(processor) + + # VILT resizes images to dims divisible by size_divisor + vilt_compatible_image_size = (32, 384) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input, return_tensors="pt") + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], vilt_compatible_image_size[-1]) + + @require_torch + @require_vision + def test_kwargs_overrides_default_image_processor_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + processor_components = self.prepare_components() + processor_components["image_processor"] = self.get_component("image_processor", size=(234, 234)) + processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") + + processor = self.processor_class(**processor_components) + self.skip_processor_without_typed_kwargs(processor) + + # VILT resizes images to dims divisible by size_divisor + vilt_compatible_image_size = (32, 352) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor( + text=input_str, images=image_input, size=[224, 224], crop_size=(224, 224), return_tensors="pt" + ) + + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], vilt_compatible_image_size[-1]) + + @require_torch + @require_vision + def test_structured_kwargs_nested(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) + self.skip_processor_without_typed_kwargs(processor) + + # VILT resizes images to dims divisible by size_divisor + vilt_compatible_image_size = (32, 352) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": { + "size": {"height": 214, "width": 214}, + "crop_size": {"height": 214, "width": 214}, + }, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.skip_processor_without_typed_kwargs(processor) + + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], vilt_compatible_image_size[-1]) + self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 76) + + @require_torch + @require_vision + def test_structured_kwargs_nested_from_dict(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + processor_components = self.prepare_components() + processor = 
self.processor_class(**processor_components) + self.skip_processor_without_typed_kwargs(processor) + + # VILT resizes images to dims divisible by size_divisor + vilt_compatible_image_size = (32, 352) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": { + "size": {"height": 214, "width": 214}, + "crop_size": {"height": 214, "width": 214}, + }, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], vilt_compatible_image_size[-1]) + self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 76) + + @require_torch + @require_vision + def test_unstructured_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) + self.skip_processor_without_typed_kwargs(processor) + + # VILT resizes images to dims divisible by size_divisor + vilt_compatible_image_size = (32, 352) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + size={"height": 214, "width": 214}, + crop_size={"height": 214, "width": 214}, + padding="max_length", + max_length=76, + ) + + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], vilt_compatible_image_size[-1]) + self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 76) + + @require_torch + @require_vision + def test_unstructured_kwargs_batched(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) + self.skip_processor_without_typed_kwargs(processor) + + # VILT resizes images to dims divisible by size_divisor + vilt_compatible_image_size = (32, 352) + + input_str = ["lower newer", "upper older longer string"] + image_input = self.prepare_image_inputs() * 2 + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + size={"height": 214, "width": 214}, + crop_size={"height": 214, "width": 214}, + padding="longest", + max_length=76, + ) + + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], vilt_compatible_image_size[-1]) + self.assertEqual(len(inputs[self.text_data_arg_name][0]), len(inputs[self.text_data_arg_name][1])) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index ec1e211872e667..bad0eb8cd6b72b 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -26,6 +26,7 @@ import unittest import numpy as np +from huggingface_hub import hf_hub_download from transformers import CLIPTokenizerFast, ProcessorMixin from transformers.models.auto.processing_auto import processor_class_from_name @@ -51,6 +52,7 @@ class ProcessorTesterMixin: processor_class = None text_data_arg_name = "input_ids" images_data_arg_name = "pixel_values" + videos_data_arg_name = "pixel_values_videos" def prepare_processor_dict(self): return {} @@ -90,6 +92,13 @@ def prepare_image_inputs(self): image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] return image_inputs 
+ @require_vision + def prepare_video_inputs(self): + video_file = hf_hub_download( + repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset" + ) + return [np.load(video_file)] + def test_processor_to_json_string(self): processor = self.get_processor() obj = json.loads(processor.to_json_string()) @@ -129,43 +138,69 @@ def skip_processor_without_typed_kwargs(self, processor): def test_tokenizer_defaults_preserved_by_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") + processor_components = self.prepare_components() + processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input, return_tensors="pt") - self.assertEqual(len(inputs[self.text_data_arg_name][0]), 117) + self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 117) @require_torch @require_vision def test_image_processor_defaults_preserved_by_image_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") + processor_components = self.prepare_components() + processor_components["image_processor"] = self.get_component( + "image_processor", size=(234, 234), crop_size=(234, 234) + ) + processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") + + processor = self.processor_class(**processor_components) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input, return_tensors="pt") + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 234) + + @require_torch + @require_vision + def test_video_processor_defaults_preserved_by_kwargs(self): + if "video_processor" not in self.processor_class.attributes: + self.skipTest(f"video_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor", size=(234, 234), crop_size=(234, 234)) + video_processor = self.get_component("video_processor", size=(234, 234), crop_size=(234, 234)) tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor = self.processor_class( + tokenizer=tokenizer, + image_processor=image_processor, + video_processor=video_processor, + ) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() + video_input = self.prepare_video_inputs() - inputs = processor(text=input_str, images=image_input) - self.assertEqual(len(inputs[self.images_data_arg_name][0][0]), 234) + inputs = processor(text=input_str, images=image_input, videos=video_input, return_tensors="pt") + self.assertEqual(inputs[self.videos_data_arg_name].shape[-1], 234) @require_vision @require_torch def test_kwargs_overrides_default_tokenizer_kwargs(self): if 
"image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer", padding="longest") + processor_components = self.prepare_components() + processor_components["tokenizer"] = self.get_component("tokenizer", padding="longest") - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() @@ -173,34 +208,35 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): inputs = processor( text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length" ) - self.assertEqual(len(inputs[self.text_data_arg_name][0]), 112) + self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 112) @require_torch @require_vision def test_kwargs_overrides_default_image_processor_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", size=(234, 234)) - tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") + processor_components = self.prepare_components() + processor_components["image_processor"] = self.get_component("image_processor", size=(234, 234)) + processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input, size=[224, 224], crop_size=(224, 224)) - self.assertEqual(len(inputs[self.images_data_arg_name][0][0]), 224) + inputs = processor( + text=input_str, images=image_input, size=[224, 224], crop_size=(224, 224), return_tensors="pt" + ) + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 224) @require_torch @require_vision def test_unstructured_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" @@ -215,18 +251,16 @@ def test_unstructured_kwargs(self): max_length=76, ) - self.assertEqual(inputs[self.images_data_arg_name].shape[2], 214) - self.assertEqual(len(inputs[self.text_data_arg_name][0]), 76) + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 214) + self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 76) @require_torch @require_vision def test_unstructured_kwargs_batched(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = 
self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = ["lower newer", "upper older longer string"] @@ -241,19 +275,16 @@ def test_unstructured_kwargs_batched(self): max_length=76, ) - self.assertEqual(inputs[self.images_data_arg_name].shape[2], 214) - - self.assertEqual(len(inputs[self.text_data_arg_name][0]), 6) + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 214) + self.assertEqual(len(inputs[self.text_data_arg_name][0]), len(inputs[self.text_data_arg_name][1])) @require_torch @require_vision def test_doubly_passed_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = ["lower newer"] @@ -265,6 +296,7 @@ def test_doubly_passed_kwargs(self): images_kwargs={"size": {"height": 222, "width": 222}}, size={"height": 214, "width": 214}, crop_size={"height": 214, "width": 214}, + return_tensors="pt", ) @require_torch @@ -272,10 +304,8 @@ def test_doubly_passed_kwargs(self): def test_structured_kwargs_nested(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" @@ -294,21 +324,18 @@ def test_structured_kwargs_nested(self): inputs = processor(text=input_str, images=image_input, **all_kwargs) self.skip_processor_without_typed_kwargs(processor) - self.assertEqual(inputs[self.images_data_arg_name].shape[2], 214) - - self.assertEqual(len(inputs[self.text_data_arg_name][0]), 76) + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 214) + self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 76) @require_torch @require_vision def test_structured_kwargs_nested_from_dict(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" image_input = self.prepare_image_inputs() @@ -323,9 +350,8 @@ def test_structured_kwargs_nested_from_dict(self): } inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.assertEqual(inputs[self.images_data_arg_name].shape[2], 214) - - self.assertEqual(len(inputs[self.text_data_arg_name][0]), 76) + 
self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 214) + self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 76) class MyProcessor(ProcessorMixin): From d3259142deb471abab6e394283d75f9cadf53ccf Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Sat, 17 Aug 2024 14:26:05 +0800 Subject: [PATCH 09/25] fix linter issue --- .../models/bridgetower/image_processing_bridgetower.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower.py b/src/transformers/models/bridgetower/image_processing_bridgetower.py index 49905d36b33518..b9d0d41377bfde 100644 --- a/src/transformers/models/bridgetower/image_processing_bridgetower.py +++ b/src/transformers/models/bridgetower/image_processing_bridgetower.py @@ -238,9 +238,7 @@ def resize( The channel dimension format of the input image. If not provided, it will be inferred. """ size = get_size_dict(size, default_to_square=False) - if "shortest_edge" not in size: - raise ValueError(f"The `size` dictionary must contain the key `shortest_edge`. Got {size.keys()}") - shorter = size["shortest_edge"] + shorter = size["shortest_edge"] if "shortest_edge" in size else min(size["height"], size["width"]) longer = int(1333 / 800 * shorter) output_size = get_resize_output_image_size( image, shorter=shorter, longer=longer, size_divisor=size_divisor, input_data_format=input_data_format From 935d6e51d470861484f0fd2ec1ea5e9a6982a6d2 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Mon, 19 Aug 2024 16:43:00 +0800 Subject: [PATCH 10/25] address @zucchini-nlp's comments --- .../models/altclip/processing_altclip.py | 16 ++-------- .../models/chameleon/processing_chameleon.py | 10 +++--- .../models/flava/processing_flava.py | 8 ++--- .../processing_instructblipvideo.py | 32 +++++++++++++------ .../processing_llava_next_video.py | 10 +++--- .../models/siglip/processing_siglip.py | 6 ++-- .../video_llava/processing_video_llava.py | 6 ++-- .../models/vilt/processing_vilt.py | 26 +++++++++++++-- 8 files changed, 69 insertions(+), 45 deletions(-) diff --git a/src/transformers/models/altclip/processing_altclip.py b/src/transformers/models/altclip/processing_altclip.py index af9b0aee5930a6..63168c7dd205bc 100644 --- a/src/transformers/models/altclip/processing_altclip.py +++ b/src/transformers/models/altclip/processing_altclip.py @@ -22,7 +22,7 @@ from ...image_utils import ImageInput from ...processing_utils import ProcessingKwargs, ProcessorMixin -from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput +from ...tokenization_utils_base import PreTokenizedInput, TextInput if sys.version_info >= (3, 11): @@ -94,16 +94,8 @@ def __call__( The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. Both channels-first and channels-last formats are supported. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - - `'tf'`: Return TensorFlow `tf.constant` objects. - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - - `'jax'`: Return JAX `jnp.ndarray` objects. - Returns: - [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. 
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when @@ -133,9 +125,7 @@ def __call__( elif text is not None: return encoding else: - return BatchEncoding( - data=dict(**image_features), tensor_type=output_kwargs["common_kwargs"]["return_tensors"] - ) + return image_features def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/chameleon/processing_chameleon.py b/src/transformers/models/chameleon/processing_chameleon.py index d999267ef1fa9c..d9fbe965034e59 100644 --- a/src/transformers/models/chameleon/processing_chameleon.py +++ b/src/transformers/models/chameleon/processing_chameleon.py @@ -114,7 +114,7 @@ def __call__( elif not isinstance(text, list) and not isinstance(text[0], str): raise TypeError("Invalid input text. Please provide a string, or a list of strings") if text is None and images is None: - raise ValueError("You must provide either text or images") + raise ValueError("You must provide either text or images as prompt") output_kwargs = self._merge_kwargs( ChameleonProcessorKwargs, @@ -132,12 +132,10 @@ def __call__( sample += self.tokenizer.sep_token # special Chameleon treatment to add sep for chat mode prompt_strings.append(sample) - data = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) - + features = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) if images is not None: - data["pixel_values"] = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"] - - return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"]["return_tensors"]) + features["pixel_values"] = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"] + return features # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama def batch_decode(self, *args, **kwargs): diff --git a/src/transformers/models/flava/processing_flava.py b/src/transformers/models/flava/processing_flava.py index e98df78f6034d5..b62fc77fc38746 100644 --- a/src/transformers/models/flava/processing_flava.py +++ b/src/transformers/models/flava/processing_flava.py @@ -22,7 +22,7 @@ from ...image_utils import ImageInput from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin -from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput +from ...tokenization_utils_base import PreTokenizedInput, TextInput if sys.version_info >= (3, 11): @@ -92,6 +92,8 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + audio=None, + videos=None, **kwargs: Unpack[FlavaProcessorKwargs], ): """ @@ -121,9 +123,7 @@ def __call__( elif text is not None: return encoding else: - return BatchEncoding( - data=dict(**image_features), tensor_type=output_kwargs["common_kwargs"]["return_tensors"] - ) + return image_features def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py index c38edc2048549f..58f2fc565a7fd6 100644 --- a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py @@ -20,15 +20,10 @@ import sys from typing import List, Optional, Union -from ...image_processing_utils import BatchFeature +from ...feature_extraction_utils import BatchFeature from 
...image_utils import VideoInput from ...processing_utils import ProcessingKwargs, ProcessorMixin -from ...tokenization_utils_base import ( - AddedToken, - BatchEncoding, - PreTokenizedInput, - TextInput, -) +from ...tokenization_utils_base import AddedToken, PreTokenizedInput, TextInput from ...utils import logging from ..auto import AutoTokenizer @@ -103,7 +98,26 @@ def __call__( This method uses [`InstructBlipVideoImageProcessor.__call__`] method to prepare image(s) or video(s) for the model, and [`BertTokenizerFast.__call__`] to prepare text for the model. - Please refer to the docstring of the above two methods for more information. + Args: + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + -- **qformer_input_ids** - List of token ids from the Q-Former tokenizer to be fed to a model. Returned when `text` is not `None`. + -- **qformer_attention_mask** - List of indices specifying which tokens from the Q-Former tokenizer should be attended to by the model. Returned when `text` is not `None`. + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ output_kwargs = self._merge_kwargs( InstructBlipVideoProcessorKwargs, @@ -150,7 +164,7 @@ def __call__( ) # cast to desired return tensors type after concatenating - text_encoding = BatchEncoding( + text_encoding = BatchFeature( text_encoding, tensor_type=output_kwargs["common_kwargs"].get("return_tensors") ) encoding.update(text_encoding) diff --git a/src/transformers/models/llava_next_video/processing_llava_next_video.py b/src/transformers/models/llava_next_video/processing_llava_next_video.py index e693ce265ef1e6..d11512ae7ff8f8 100644 --- a/src/transformers/models/llava_next_video/processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/processing_llava_next_video.py @@ -139,7 +139,8 @@ def __call__( - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not `None`). - - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + - **pixel_values_images** -- Pixel values of images to be fed to a model. Returned when `images` is not `None`. + - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`. 
""" output_kwargs = self._merge_kwargs( LlavaNextVideoProcessorKwargs, @@ -162,8 +163,6 @@ def __call__( elif not isinstance(text, list) and not isinstance(text[0], str): raise ValueError("Invalid input text. Please provide a string, or a list of strings") - print(self.patch_size, self.vision_feature_select_strategy, image_inputs, videos_inputs.keys()) - if self.patch_size is None or self.vision_feature_select_strategy is None: prompt_strings = text logger.warning_once( @@ -207,7 +206,10 @@ def __call__( text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) - return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}) + return BatchFeature( + data={**text_inputs, **image_inputs, **videos_inputs}, + tensor_type=output_kwargs["common_kwargs"].get("return_tensors"), + ) # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama def batch_decode(self, *args, **kwargs): diff --git a/src/transformers/models/siglip/processing_siglip.py b/src/transformers/models/siglip/processing_siglip.py index eef71e33424d8e..cd7391f7bff395 100644 --- a/src/transformers/models/siglip/processing_siglip.py +++ b/src/transformers/models/siglip/processing_siglip.py @@ -67,6 +67,8 @@ def __call__( self, text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, images: Optional[ImageInput] = None, + audio=None, + videos=None, **kwargs: Unpack[SiglipProcessingKwargs], ) -> BatchFeature: """ @@ -116,9 +118,7 @@ def __call__( elif text is not None: return encoding else: - return BatchFeature( - data=dict(**image_features), tensor_type=output_kwargs["common_kwargs"]["return_tensors"] - ) + return image_features def decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/video_llava/processing_video_llava.py b/src/transformers/models/video_llava/processing_video_llava.py index 774c4003f3cb0f..61ebfbc40831de 100644 --- a/src/transformers/models/video_llava/processing_video_llava.py +++ b/src/transformers/models/video_llava/processing_video_llava.py @@ -20,7 +20,7 @@ from typing import List, Optional, Union from ...feature_extraction_utils import BatchFeature -from ...image_utils import ImageInput, get_image_size, to_numpy_array +from ...image_utils import ImageInput, VideoInput, get_image_size, to_numpy_array from ...processing_utils import ProcessingKwargs, ProcessorMixin from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import logging @@ -97,7 +97,7 @@ def __call__( self, text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, images: Optional[ImageInput] = None, - videos: Optional[ImageInput] = None, + videos: Optional[VideoInput] = None, audio=None, **kwargs: Unpack[VideoLlavaProcessorKwargs], ) -> BatchFeature: @@ -181,7 +181,7 @@ def __call__( text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) data.update(text_inputs) - return BatchFeature(data=data) + return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama def batch_decode(self, *args, **kwargs): diff --git a/src/transformers/models/vilt/processing_vilt.py b/src/transformers/models/vilt/processing_vilt.py index 46e18b3ff6bd88..91f466e1079e1a 100644 --- a/src/transformers/models/vilt/processing_vilt.py +++ b/src/transformers/models/vilt/processing_vilt.py @@ -20,9 +20,10 @@ import warnings 
from typing import List, Optional, Union +from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput from ...processing_utils import ProcessingKwargs, ProcessorMixin -from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput +from ...tokenization_utils_base import PreTokenizedInput, TextInput if sys.version_info >= (3, 11): @@ -87,13 +88,32 @@ def __call__( self, images: ImageInput, text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + audio=None, + videos=None, **kwargs: Unpack[ViltProcessorKwargs], - ) -> BatchEncoding: + ) -> BatchFeature: """ This method uses [`ViltImageProcessor.__call__`] method to prepare image(s) for the model, and [`BertTokenizerFast.__call__`] to prepare text for the model. - Please refer to the docstring of the above two methods for more information. + Args: + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ output_kwargs = self._merge_kwargs( ViltProcessorKwargs, From 39650f60c7b5d4808dd54aa6acbd38faee61bf0f Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Mon, 19 Aug 2024 16:57:04 +0800 Subject: [PATCH 11/25] improve docs --- .../models/altclip/processing_altclip.py | 6 ++++-- .../models/chameleon/processing_chameleon.py | 4 ++-- .../models/flava/processing_flava.py | 19 ++++++++++++++++++- .../processing_instructblipvideo.py | 3 ++- .../processing_llava_next_video.py | 6 +++--- .../models/siglip/processing_siglip.py | 4 ++-- .../video_llava/processing_video_llava.py | 6 +++--- .../models/vilt/processing_vilt.py | 2 +- 8 files changed, 35 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/altclip/processing_altclip.py b/src/transformers/models/altclip/processing_altclip.py index 63168c7dd205bc..c15b18d029c206 100644 --- a/src/transformers/models/altclip/processing_altclip.py +++ b/src/transformers/models/altclip/processing_altclip.py @@ -76,6 +76,8 @@ def __call__( self, text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, images: Optional[ImageInput] = None, + audio=None, + videos=None, **kwargs: Unpack[AltCLIPProcessingKwargs], ): """ @@ -86,11 +88,11 @@ def __call__( of the above two methods for more information. 
Args: - text (`str`, `List[str]`, `List[List[str]]`): + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`): The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + images (`ImageInput`, *optional*): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. Both channels-first and channels-last formats are supported. diff --git a/src/transformers/models/chameleon/processing_chameleon.py b/src/transformers/models/chameleon/processing_chameleon.py index d9fbe965034e59..14d759ec6dcf87 100644 --- a/src/transformers/models/chameleon/processing_chameleon.py +++ b/src/transformers/models/chameleon/processing_chameleon.py @@ -92,11 +92,11 @@ def __call__( of the above two methods for more information. Args: - text (`str`, `List[str]`, `List[List[str]]`): + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`, *optional*): The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + images (`ImageInput`, *optional*): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. Both channels-first and channels-last formats are supported. diff --git a/src/transformers/models/flava/processing_flava.py b/src/transformers/models/flava/processing_flava.py index b62fc77fc38746..c3a3fe12046660 100644 --- a/src/transformers/models/flava/processing_flava.py +++ b/src/transformers/models/flava/processing_flava.py @@ -100,7 +100,24 @@ def __call__( This method uses [`FlavaImageProcessor.__call__`] method to prepare image(s) for the model, and [`BertTokenizerFast.__call__`] to prepare text for the model. - Please refer to the docstring of the above two methods for more information. + Args: + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`ImageInput`, *optional*): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). 
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ if text is None and images is None: diff --git a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py index 58f2fc565a7fd6..1855dd4db4e58e 100644 --- a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py @@ -103,7 +103,7 @@ def __call__( The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + images (`ImageInput`, *optional*): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a number of channels, H and W are image height and width. @@ -119,6 +119,7 @@ def __call__( -- **qformer_attention_mask** - List of indices specifying which tokens from the Q-Former tokenizer should be attended to by the model. Returned when `text` is not `None`. - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ + output_kwargs = self._merge_kwargs( InstructBlipVideoProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, diff --git a/src/transformers/models/llava_next_video/processing_llava_next_video.py b/src/transformers/models/llava_next_video/processing_llava_next_video.py index d11512ae7ff8f8..ad85ff8d15e266 100644 --- a/src/transformers/models/llava_next_video/processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/processing_llava_next_video.py @@ -121,14 +121,14 @@ def __call__( of the above two methods for more information. Args: - text (`str`, `List[str]`, `List[List[str]]`): + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`, *optional*): The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + images (`ImageInput`, *optional*): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. Both channels-first and channels-last formats are supported. - videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`): + videos (`VideoInput`, *optional*): The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported. diff --git a/src/transformers/models/siglip/processing_siglip.py b/src/transformers/models/siglip/processing_siglip.py index cd7391f7bff395..265832d840daca 100644 --- a/src/transformers/models/siglip/processing_siglip.py +++ b/src/transformers/models/siglip/processing_siglip.py @@ -79,11 +79,11 @@ def __call__( of the above two methods for more information. 
Args: - text (`str`, `List[str]`, `List[List[str]]`): + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`, *optional*): The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + images (`ImageInput`, *optional*): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. Both channels-first and channels-last formats are supported. diff --git a/src/transformers/models/video_llava/processing_video_llava.py b/src/transformers/models/video_llava/processing_video_llava.py index 61ebfbc40831de..49c103fabe9729 100644 --- a/src/transformers/models/video_llava/processing_video_llava.py +++ b/src/transformers/models/video_llava/processing_video_llava.py @@ -109,15 +109,15 @@ def __call__( of the above two methods for more information. Args: - text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`): + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`, *optional*): The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + images (`ImageInput`, *optional*): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a number of channels, H and W are image height and width. - videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`): + videos (`VideoInput`, *optional*): Video frames to preprocess. Expects a single or batch of video frames in NumPy array or PyTorch tensor. Each video should be of shape (T, C, H, W), where T is number of frames, C is number of channels, H and W are image height and width. diff --git a/src/transformers/models/vilt/processing_vilt.py b/src/transformers/models/vilt/processing_vilt.py index 91f466e1079e1a..a282ebe2c9e3da 100644 --- a/src/transformers/models/vilt/processing_vilt.py +++ b/src/transformers/models/vilt/processing_vilt.py @@ -101,7 +101,7 @@ def __call__( The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + images (`ImageInput`, *optional*): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a number of channels, H and W are image height and width. 
From 539da9dde154645558c2daaafc7bf83f01fbd2c8 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Mon, 19 Aug 2024 17:05:53 +0800 Subject: [PATCH 12/25] don't dw from hub for video tests --- tests/test_processing_common.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index bad0eb8cd6b72b..ec77e1fed7fa70 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -26,7 +26,6 @@ import unittest import numpy as np -from huggingface_hub import hf_hub_download from transformers import CLIPTokenizerFast, ProcessorMixin from transformers.models.auto.processing_auto import processor_class_from_name @@ -94,10 +93,7 @@ def prepare_image_inputs(self): @require_vision def prepare_video_inputs(self): - video_file = hf_hub_download( - repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset" - ) - return [np.load(video_file)] + return [np.random.randint(255, size=(4, 3, 30, 400), dtype=np.uint8)] def test_processor_to_json_string(self): processor = self.get_processor() From c8b2384b00de01eaaf34a1f7d0afa68cae623ed1 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Mon, 19 Aug 2024 17:30:42 +0800 Subject: [PATCH 13/25] add video processing tests for instructblipvideo & video_llava --- .../processing_instructblipvideo.py | 27 ++++++++++++++----- .../test_processor_instructblipvideo.py | 1 + tests/test_processing_common.py | 25 +++++++++-------- 3 files changed, 36 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py index 1855dd4db4e58e..2a60f7ab3ef07a 100644 --- a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py @@ -18,6 +18,7 @@ import os import sys +import warnings from typing import List, Optional, Union from ...feature_extraction_utils import BatchFeature @@ -88,10 +89,10 @@ def __init__(self, image_processor, tokenizer, qformer_tokenizer=None, num_query def __call__( self, - images: Optional[VideoInput] = None, + images: Optional[VideoInput] = None, # Keeping this here for backwards compatibility text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + videos: Optional[VideoInput] = None, audio=None, - videos=None, **kwargs: Unpack[InstructBlipVideoProcessorKwargs], ) -> BatchFeature: """ @@ -107,6 +108,9 @@ def __call__( The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a number of channels, H and W are image height and width. + videos (`VideoInput`, *optional*): + The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch + tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported. Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: @@ -119,6 +123,17 @@ def __call__( -- **qformer_attention_mask** - List of indices specifying which tokens from the Q-Former tokenizer should be attended to by the model. Returned when `text` is not `None`. - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. 
""" + if images is not None: + warnings.warn( + "The `images` argument is deprecated and will be removed in future versions, use `videos` instead.", + FutureWarning, + ) + if images is not None and videos is not None: + raise ValueError( + "You cannot provide both `images` and `videos` at the same time. Please pass video data as `videos=...` instead." + ) + if images is not None and videos is None: + videos = images output_kwargs = self._merge_kwargs( InstructBlipVideoProcessorKwargs, @@ -144,7 +159,7 @@ def __call__( # if we know how many query tokens, expand text inside processor. We need this hacky manipulation # because BLIP expects image tokens to be at the beginning even before BOS token - if self.num_query_tokens is not None and images is not None: + if self.num_query_tokens is not None and videos is not None: text_encoding = {} video_tokens = ( self.video_token.content * self.num_query_tokens * 4 @@ -157,7 +172,7 @@ def __call__( ] else: text_encoding = _text_encoding - if images is not None: + if videos is not None: logger.warning_once( "Expanding inputs for video tokens in InstructBLIPVideo should be done in processing. " "Please follow instruction here (https://gist.github.com/zucchini-nlp/65f22892b054dc0d68228af56fbeaac2) to update your InstructBLIPVideo model. " @@ -173,8 +188,8 @@ def __call__( encoding["qformer_input_ids"] = qformer_text_encoding.pop("input_ids") encoding["qformer_attention_mask"] = qformer_text_encoding.pop("attention_mask") - if images is not None: - image_encoding = self.image_processor(images, **output_kwargs["images_kwargs"]) + if videos is not None: + image_encoding = self.image_processor(videos, **output_kwargs["images_kwargs"]) encoding.update(image_encoding) return encoding diff --git a/tests/models/instructblipvideo/test_processor_instructblipvideo.py b/tests/models/instructblipvideo/test_processor_instructblipvideo.py index 07c81f1f649651..9442a429944226 100644 --- a/tests/models/instructblipvideo/test_processor_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_processor_instructblipvideo.py @@ -9,6 +9,7 @@ class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase): from_pretrained_id = "Salesforce/instructblip-vicuna-7b" processor_class = InstructBlipVideoProcessor + videos_data_arg_name = "pixel_values" def prepare_components(self): components = super().prepare_components() diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index ec77e1fed7fa70..ebb5e6f74f3d07 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -168,24 +168,27 @@ def test_image_processor_defaults_preserved_by_image_kwargs(self): @require_torch @require_vision def test_video_processor_defaults_preserved_by_kwargs(self): - if "video_processor" not in self.processor_class.attributes: + if "video_processor" not in self.processor_class.attributes and ( + "videos" not in inspect.signature(self.processor_class.__call__).parameters + or inspect.signature(self.processor_class.__call__).parameters["videos"].annotation == inspect._empty + ): self.skipTest(f"video_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", size=(234, 234), crop_size=(234, 234)) - video_processor = self.get_component("video_processor", size=(234, 234), crop_size=(234, 234)) - tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") - - processor = self.processor_class( - tokenizer=tokenizer, - image_processor=image_processor, - 
video_processor=video_processor, + processor_components = self.prepare_components() + processor_components["image_processor"] = self.get_component( + "image_processor", size=(234, 234), crop_size=(234, 234) ) + if "video_processor" in self.processor_class.attributes: + processor_components["video_processor"] = self.get_component( + "video_processor", size=(234, 234), crop_size=(234, 234) + ) + processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" - image_input = self.prepare_image_inputs() video_input = self.prepare_video_inputs() - inputs = processor(text=input_str, images=image_input, videos=video_input, return_tensors="pt") + inputs = processor(text=input_str, videos=video_input, return_tensors="pt") self.assertEqual(inputs[self.videos_data_arg_name].shape[-1], 234) @require_vision From 423d8645b1583102e986f76503fdbbb7d9a3b2fb Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Mon, 19 Aug 2024 21:08:50 +0800 Subject: [PATCH 14/25] add git, mgp, tvp, & x-clip --- .../models/altclip/processing_altclip.py | 17 ++-- .../models/flava/processing_flava.py | 13 +-- src/transformers/models/git/processing_git.py | 66 ++++++++------ .../models/mgp_str/processing_mgp_str.py | 87 +++++++++++++++---- .../models/siglip/processing_siglip.py | 16 ++-- .../models/tvp/image_processing_tvp.py | 16 ++++ src/transformers/models/tvp/processing_tvp.py | 74 ++++++++++------ .../models/vilt/processing_vilt.py | 3 +- .../models/x_clip/processing_x_clip.py | 55 ++++++++---- tests/models/git/test_processor_git.py | 6 +- .../models/mgp_str/test_processor_mgp_str.py | 22 ++--- tests/models/tvp/test_processor_tvp.py | 81 +++++++++++++++++ tests/models/x_clip/test_processor_x_clip.py | 54 ++++++++++++ 13 files changed, 387 insertions(+), 123 deletions(-) create mode 100644 tests/models/tvp/test_processor_tvp.py create mode 100644 tests/models/x_clip/test_processor_x_clip.py diff --git a/src/transformers/models/altclip/processing_altclip.py b/src/transformers/models/altclip/processing_altclip.py index c15b18d029c206..9d01f96afac5e3 100644 --- a/src/transformers/models/altclip/processing_altclip.py +++ b/src/transformers/models/altclip/processing_altclip.py @@ -20,6 +20,7 @@ import warnings from typing import List, Optional, Union +from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput from ...processing_utils import ProcessingKwargs, ProcessorMixin from ...tokenization_utils_base import PreTokenizedInput, TextInput @@ -31,7 +32,7 @@ from typing_extensions import Unpack -class AltCLIPProcessingKwargs(ProcessingKwargs, total=False): +class AltCLIPProcessorKwargs(ProcessingKwargs, total=False): _defaults = {} @@ -78,8 +79,8 @@ def __call__( images: Optional[ImageInput] = None, audio=None, videos=None, - **kwargs: Unpack[AltCLIPProcessingKwargs], - ): + **kwargs: Unpack[AltCLIPProcessorKwargs], + ) -> BatchFeature: """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` and `kwargs` arguments to XLMRobertaTokenizerFast's [`~XLMRobertaTokenizerFast.__call__`] if `text` is not @@ -110,7 +111,7 @@ def __call__( raise ValueError("You have to specify either text or images. 
Both cannot be none.") output_kwargs = self._merge_kwargs( - AltCLIPProcessingKwargs, + AltCLIPProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, ) @@ -121,13 +122,13 @@ def __call__( if images is not None: image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) + return_tensors = output_kwargs["common_kwargs"].get("return_tensors") if text is not None and images is not None: - encoding["pixel_values"] = image_features.pixel_values - return encoding + return BatchFeature(data=dict(**encoding, **image_features), tensor_type=return_tensors) elif text is not None: - return encoding + return BatchFeature(data=dict(**encoding), tensor_type=return_tensors) else: - return image_features + return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/flava/processing_flava.py b/src/transformers/models/flava/processing_flava.py index c3a3fe12046660..d1da077a185043 100644 --- a/src/transformers/models/flava/processing_flava.py +++ b/src/transformers/models/flava/processing_flava.py @@ -20,6 +20,7 @@ import warnings from typing import List, Optional, Union +from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin from ...tokenization_utils_base import PreTokenizedInput, TextInput @@ -95,13 +96,13 @@ def __call__( audio=None, videos=None, **kwargs: Unpack[FlavaProcessorKwargs], - ): + ) -> BatchFeature: """ This method uses [`FlavaImageProcessor.__call__`] method to prepare image(s) for the model, and [`BertTokenizerFast.__call__`] to prepare text for the model. Args: - text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`): + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`, *optional*): The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). 
@@ -134,13 +135,13 @@ def __call__( if images is not None: image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) + return_tensors = output_kwargs["common_kwargs"].get("return_tensors") if text is not None and images is not None: - encoding.update(image_features) - return encoding + return BatchFeature(data=dict(**encoding, **image_features), tensor_type=return_tensors) elif text is not None: - return encoding + return BatchFeature(data=dict(**encoding), tensor_type=return_tensors) else: - return image_features + return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/git/processing_git.py b/src/transformers/models/git/processing_git.py index 98649c644e728c..3d1166e3b06d97 100644 --- a/src/transformers/models/git/processing_git.py +++ b/src/transformers/models/git/processing_git.py @@ -16,8 +16,23 @@ Image/Text processor class for GIT """ -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import BatchEncoding +import sys +from typing import List, Optional, Union + +from ...feature_extraction_utils import BatchFeature +from ...image_utils import ImageInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin +from ...tokenization_utils_base import PreTokenizedInput, TextInput + + +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + + +class GitProcessorKwargs(ProcessingKwargs, total=False): + _defaults = {} class GitProcessor(ProcessorMixin): @@ -42,7 +57,14 @@ def __init__(self, image_processor, tokenizer): super().__init__(image_processor, tokenizer) self.current_processor = self.image_processor - def __call__(self, text=None, images=None, return_tensors=None, **kwargs): + def __call__( + self, + text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + images: Optional[ImageInput] = None, + audio=None, + videos=None, + **kwargs: Unpack[GitProcessorKwargs], + ) -> BatchFeature: """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode @@ -51,24 +73,16 @@ def __call__(self, text=None, images=None, return_tensors=None, **kwargs): of the above two methods for more information. Args: - text (`str`, `List[str]`, `List[List[str]]`): + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`, *optional*): The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + images (`ImageInput`, *optional*): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. Both channels-first and channels-last formats are supported. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - - `'tf'`: Return TensorFlow `tf.constant` objects. - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. 
- - `'jax'`: Return JAX `jnp.ndarray` objects. - Returns: - [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when @@ -76,29 +90,29 @@ def __call__(self, text=None, images=None, return_tensors=None, **kwargs): `None`). - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ - tokenizer_kwargs, image_processor_kwargs = {}, {} - if kwargs: - tokenizer_kwargs = {k: v for k, v in kwargs.items() if k not in self.image_processor._valid_processor_keys} - image_processor_kwargs = { - k: v for k, v in kwargs.items() if k in self.image_processor._valid_processor_keys - } + + output_kwargs = self._merge_kwargs( + GitProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) if text is None and images is None: raise ValueError("You have to specify either text or images. Both cannot be none.") if text is not None: - encoding = self.tokenizer(text, return_tensors=return_tensors, **tokenizer_kwargs) + encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) if images is not None: - image_features = self.image_processor(images, return_tensors=return_tensors, **image_processor_kwargs) + image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) + return_tensors = output_kwargs["common_kwargs"].get("return_tensors") if text is not None and images is not None: - encoding["pixel_values"] = image_features.pixel_values - return encoding + return BatchFeature(data=dict(**encoding, **image_features), tensor_type=return_tensors) elif text is not None: - return encoding + return BatchFeature(data=dict(**encoding), tensor_type=return_tensors) else: - return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) + return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/mgp_str/processing_mgp_str.py b/src/transformers/models/mgp_str/processing_mgp_str.py index 207d4230ba09b7..7e30a0336b809f 100644 --- a/src/transformers/models/mgp_str/processing_mgp_str.py +++ b/src/transformers/models/mgp_str/processing_mgp_str.py @@ -14,13 +14,24 @@ # limitations under the License. 
"""Processor class for MGP-STR.""" +import sys import warnings +from typing import List, Optional, Union from transformers import AutoTokenizer -from transformers.utils import is_torch_available -from transformers.utils.generic import ExplicitEnum -from ...processing_utils import ProcessorMixin +from ...feature_extraction_utils import BatchFeature +from ...image_utils import ImageInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin +from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils.generic import ExplicitEnum +from ...utils.import_utils import is_torch_available + + +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack if is_torch_available(): @@ -36,6 +47,10 @@ class DecodeType(ExplicitEnum): SUPPORTED_ANNOTATION_FORMATS = (DecodeType.CHARACTER, DecodeType.BPE, DecodeType.WORDPIECE) +class MgpstrProcessorKwargs(ProcessingKwargs, total=False): + _defaults = {} + + class MgpstrProcessor(ProcessorMixin): r""" Constructs a MGP-STR processor which wraps an image processor and MGP-STR tokenizers into a single @@ -50,9 +65,9 @@ class MgpstrProcessor(ProcessorMixin): The tokenizer is a required input. """ - attributes = ["image_processor", "char_tokenizer"] + attributes = ["image_processor", "tokenizer"] image_processor_class = "ViTImageProcessor" - char_tokenizer_class = "MgpstrTokenizer" + tokenizer_class = "MgpstrTokenizer" def __init__(self, image_processor=None, tokenizer=None, **kwargs): feature_extractor = None @@ -63,41 +78,81 @@ def __init__(self, image_processor=None, tokenizer=None, **kwargs): FutureWarning, ) feature_extractor = kwargs.pop("feature_extractor") + if "char_tokenizer" in kwargs: + warnings.warn( + "The `char_tokenizer` argument is deprecated and will be removed in future versions, use `tokenizer`" + " instead.", + FutureWarning, + ) + char_tokenizer = kwargs.pop("char_tokenizer") image_processor = image_processor if image_processor is not None else feature_extractor + tokenizer = tokenizer if tokenizer is not None else char_tokenizer if image_processor is None: raise ValueError("You need to specify an `image_processor`.") if tokenizer is None: raise ValueError("You need to specify a `tokenizer`.") - self.char_tokenizer = tokenizer + self.tokenizer = tokenizer + self.char_tokenizer = tokenizer # For backwards compatibility self.bpe_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2") self.wp_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") super().__init__(image_processor, tokenizer) - def __call__(self, text=None, images=None, return_tensors=None, **kwargs): + def __call__( + self, + text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + images: Optional[ImageInput] = None, + audio=None, + videos=None, + **kwargs: Unpack[MgpstrProcessorKwargs], + ) -> BatchFeature: """ When used in normal mode, this method forwards all its arguments to ViTImageProcessor's [`~ViTImageProcessor.__call__`] and returns its output. This method also forwards the `text` and `kwargs` arguments to MgpstrTokenizer's [`~MgpstrTokenizer.__call__`] if `text` is not `None` to encode the text. Please refer to the doctsring of the above methods for more information. + + Args: + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`, *optional*): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). 
If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`ImageInput`, *optional*): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **labels** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ if images is None and text is None: raise ValueError("You need to specify either an `images` or `text` input to process.") + output_kwargs = self._merge_kwargs( + MgpstrProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + if images is not None: - inputs = self.image_processor(images, return_tensors=return_tensors, **kwargs) + image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) if text is not None: - encodings = self.char_tokenizer(text, return_tensors=return_tensors, **kwargs) + encodings = self.tokenizer(text, **output_kwargs["text_kwargs"]) - if text is None: - return inputs - elif images is None: - return encodings + return_tensors = output_kwargs["common_kwargs"].get("return_tensors") + if text is not None and images is not None: + return BatchFeature(data=dict(**image_features, labels=encodings["input_ids"]), tensor_type=return_tensors) + elif text is not None: + return BatchFeature(data=dict(**encodings), tensor_type=return_tensors) else: - inputs["labels"] = encodings["input_ids"] - return inputs + return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) def batch_decode(self, sequences): """ @@ -201,7 +256,7 @@ def char_decode(self, sequences): Returns: `List[str]`: The list of char decoded sentences. """ - decode_strs = [seq.replace(" ", "") for seq in self.char_tokenizer.batch_decode(sequences)] + decode_strs = [seq.replace(" ", "") for seq in self.tokenizer.batch_decode(sequences)] return decode_strs def bpe_decode(self, sequences): diff --git a/src/transformers/models/siglip/processing_siglip.py b/src/transformers/models/siglip/processing_siglip.py index 265832d840daca..ac3f659d3efaf9 100644 --- a/src/transformers/models/siglip/processing_siglip.py +++ b/src/transformers/models/siglip/processing_siglip.py @@ -31,7 +31,7 @@ from typing_extensions import Unpack -class SiglipProcessingKwargs(ProcessingKwargs, total=False): +class SiglipProcessorKwargs(ProcessingKwargs, total=False): _defaults = { "text_kwargs": { "padding": False, @@ -69,7 +69,7 @@ def __call__( images: Optional[ImageInput] = None, audio=None, videos=None, - **kwargs: Unpack[SiglipProcessingKwargs], + **kwargs: Unpack[SiglipProcessorKwargs], ) -> BatchFeature: """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` @@ -101,7 +101,7 @@ def __call__( raise ValueError("You have to specify either text or images. 
Both cannot be none.") output_kwargs = self._merge_kwargs( - SiglipProcessingKwargs, + SiglipProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, ) @@ -112,13 +112,15 @@ def __call__( if images is not None: image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) + return_tensors = output_kwargs["common_kwargs"].get("return_tensors") if text is not None and images is not None: - encoding["pixel_values"] = image_features.pixel_values - return encoding + return BatchFeature( + data=dict(**encoding, pixel_values=image_features.pixel_values), tensor_type=return_tensors + ) elif text is not None: - return encoding + return BatchFeature(data=dict(**encoding), tensor_type=return_tensors) else: - return image_features + return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) def decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/tvp/image_processing_tvp.py b/src/transformers/models/tvp/image_processing_tvp.py index 100ec133e8b026..4e9618eef17084 100644 --- a/src/transformers/models/tvp/image_processing_tvp.py +++ b/src/transformers/models/tvp/image_processing_tvp.py @@ -305,20 +305,30 @@ def _preprocess_image( # All transformations expect numpy arrays. image = to_numpy_array(image) + print(f"{image.shape = }") + if do_resize: image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + print(f"{image.shape = }") + if do_center_crop: image = self.center_crop(image, size=crop_size, input_data_format=input_data_format) + print(f"{image.shape = }") + if do_rescale: image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + print(f"{image.shape = }") + if do_normalize: image = self.normalize( image=image.astype(np.float32), mean=image_mean, std=image_std, input_data_format=input_data_format ) + print(f"{image.shape = }") + if do_pad: image = self.pad_image( image=image, @@ -328,12 +338,18 @@ def _preprocess_image( input_data_format=input_data_format, ) + print(f"{image.shape = }") + # the pretrained checkpoints assume images are BGR, not RGB if do_flip_channel_order: image = flip_channel_order(image=image, input_data_format=input_data_format) + print(f"{image.shape = }") + image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + print(f"{image.shape = }") + return image @filter_out_non_signature_kwargs() diff --git a/src/transformers/models/tvp/processing_tvp.py b/src/transformers/models/tvp/processing_tvp.py index eb8aabfdade3ed..96a85a984d8f84 100644 --- a/src/transformers/models/tvp/processing_tvp.py +++ b/src/transformers/models/tvp/processing_tvp.py @@ -16,8 +16,35 @@ Processor class for TVP. 
""" -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import BatchEncoding +import sys +from typing import List, Optional, Union + +from ...feature_extraction_utils import BatchFeature +from ...image_utils import VideoInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin, TextKwargs +from ...tokenization_utils_base import PreTokenizedInput, TextInput + + +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + + +class TvpTextKwargs(TextKwargs, total=False): + pad_to_max_length: bool + + +class TvpProcessorKwargs(ProcessingKwargs, total=False): + text_kwargs: TvpTextKwargs + _defaults = { + "text_kwargs": { + "padding": "max_length", + "truncation": True, + "pad_to_max_length": True, + "return_token_type_ids": False, + }, + } class TvpProcessor(ProcessorMixin): @@ -46,7 +73,14 @@ def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) - def __call__(self, text=None, videos=None, return_tensors=None, **kwargs): + def __call__( + self, + text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + videos: Optional[VideoInput] = None, + images=None, + audio=None, + **kwargs: Unpack[TvpProcessorKwargs], + ) -> BatchFeature: """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode @@ -65,16 +99,8 @@ def __call__(self, text=None, videos=None, return_tensors=None, **kwargs): each frame should be of shape (H, W, C), where H and W are frame height and width, and C is a number of channels. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - - `'tf'`: Return TensorFlow `tf.constant` objects. - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - - `'jax'`: Return JAX `jnp.ndarray` objects. - Returns: - [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when @@ -83,30 +109,28 @@ def __call__(self, text=None, videos=None, return_tensors=None, **kwargs): - **pixel_values** -- Pixel values to be fed to a model. Returned when `videos` is not `None`. """ - max_text_length = kwargs.pop("max_text_length", None) + if "max_text_length" in kwargs: + kwargs["max_length"] = kwargs.pop("max_text_length") if text is None and videos is None: raise ValueError("You have to specify either text or videos. 
Both cannot be none.") + output_kwargs = self._merge_kwargs( + TvpProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + encoding = {} if text is not None: - textual_input = self.tokenizer.batch_encode_plus( - text, - truncation=True, - padding="max_length", - max_length=max_text_length, - pad_to_max_length=True, - return_tensors=return_tensors, - return_token_type_ids=False, - **kwargs, - ) + textual_input = self.tokenizer(text, **output_kwargs["text_kwargs"]) encoding.update(textual_input) if videos is not None: - image_features = self.image_processor(videos, return_tensors=return_tensors, **kwargs) + image_features = self.image_processor(videos, **output_kwargs["videos_kwargs"]) encoding.update(image_features) - return BatchEncoding(data=encoding, tensor_type=return_tensors) + return BatchFeature(data=encoding, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/vilt/processing_vilt.py b/src/transformers/models/vilt/processing_vilt.py index a282ebe2c9e3da..be8729e297a875 100644 --- a/src/transformers/models/vilt/processing_vilt.py +++ b/src/transformers/models/vilt/processing_vilt.py @@ -114,6 +114,7 @@ def __call__( `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not `None`). - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + - **pixel_mask** -- Mask for the pixel values. Returned when `images` is not `None`. """ output_kwargs = self._merge_kwargs( ViltProcessorKwargs, @@ -126,7 +127,7 @@ def __call__( encoding_image_processor = self.image_processor(images, **output_kwargs["images_kwargs"]) encoding.update(encoding_image_processor) - return encoding + return BatchFeature(data=dict(**encoding), tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/x_clip/processing_x_clip.py b/src/transformers/models/x_clip/processing_x_clip.py index a11aeb18dc4f59..fa2cc860fd5d05 100644 --- a/src/transformers/models/x_clip/processing_x_clip.py +++ b/src/transformers/models/x_clip/processing_x_clip.py @@ -16,10 +16,24 @@ Image/Text processor class for XCLIP """ +import sys import warnings +from typing import List, Optional, Union -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import BatchEncoding +from ...feature_extraction_utils import BatchFeature +from ...image_utils import VideoInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin +from ...tokenization_utils_base import PreTokenizedInput, TextInput + + +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + + +class XCLIPProcessorKwargs(ProcessingKwargs, total=False): + _defaults = {} class XCLIPProcessor(ProcessorMixin): @@ -59,7 +73,14 @@ def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) self.current_processor = self.image_processor - def __call__(self, text=None, videos=None, return_tensors=None, **kwargs): + def __call__( + self, + text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + videos: Optional[VideoInput] = None, + images=None, + audio=None, + **kwargs: Unpack[XCLIPProcessorKwargs], + ) -> BatchFeature: """ Main method to prepare for the model one or several sequences(s) and image(s). 
This method forwards the `text` and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode @@ -78,16 +99,8 @@ def __call__(self, text=None, videos=None, return_tensors=None, **kwargs): each frame should be of shape (H, W, C), where H and W are frame height and width, and C is a number of channels. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - - `'tf'`: Return TensorFlow `tf.constant` objects. - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - - `'jax'`: Return JAX `jnp.ndarray` objects. - Returns: - [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when @@ -99,19 +112,25 @@ def __call__(self, text=None, videos=None, return_tensors=None, **kwargs): if text is None and videos is None: raise ValueError("You have to specify either text or videos. Both cannot be none.") + output_kwargs = self._merge_kwargs( + XCLIPProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + if text is not None: - encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs) + encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) if videos is not None: - image_features = self.image_processor(videos, return_tensors=return_tensors, **kwargs) + image_features = self.image_processor(videos, **output_kwargs["videos_kwargs"]) + return_tensors = output_kwargs["common_kwargs"].get("return_tensors") if text is not None and videos is not None: - encoding["pixel_values"] = image_features.pixel_values - return encoding + return BatchFeature(data=dict(**encoding, **image_features), tensor_type=return_tensors) elif text is not None: - return encoding + return BatchFeature(data=dict(**encoding), tensor_type=return_tensors) else: - return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) + return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) def batch_decode(self, *args, **kwargs): """ diff --git a/tests/models/git/test_processor_git.py b/tests/models/git/test_processor_git.py index 95e436d8e4f526..d66260bc57483a 100644 --- a/tests/models/git/test_processor_git.py +++ b/tests/models/git/test_processor_git.py @@ -21,6 +21,8 @@ from transformers.testing_utils import require_vision from transformers.utils import is_vision_available +from ...test_processing_common import ProcessorTesterMixin + if is_vision_available(): from PIL import Image @@ -29,7 +31,9 @@ @require_vision -class GitProcessorTest(unittest.TestCase): +class GitProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = GitProcessor + def setUp(self): self.tmpdirname = tempfile.mkdtemp() diff --git a/tests/models/mgp_str/test_processor_mgp_str.py b/tests/models/mgp_str/test_processor_mgp_str.py index 6a028a28424d61..a5322aa5d31435 100644 --- a/tests/models/mgp_str/test_processor_mgp_str.py +++ b/tests/models/mgp_str/test_processor_mgp_str.py @@ -20,29 +20,30 @@ import tempfile import unittest -import numpy as np import pytest -from transformers import MgpstrTokenizer +from transformers import MgpstrProcessor, MgpstrTokenizer from transformers.models.mgp_str.tokenization_mgp_str import 
VOCAB_FILES_NAMES from transformers.testing_utils import require_torch, require_vision from transformers.utils import IMAGE_PROCESSOR_NAME, is_torch_available, is_vision_available +from ...test_processing_common import ProcessorTesterMixin + if is_torch_available(): import torch if is_vision_available(): - from PIL import Image - - from transformers import MgpstrProcessor, ViTImageProcessor + from transformers import ViTImageProcessor @require_torch @require_vision -class MgpstrProcessorTest(unittest.TestCase): +class MgpstrProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = MgpstrProcessor image_processing_class = ViTImageProcessor if is_vision_available() else None + text_data_arg_name = "labels" @property def image_processor_dict(self): @@ -79,15 +80,6 @@ def get_image_processor(self, **kwargs): def tearDown(self): shutil.rmtree(self.tmpdirname) - def prepare_image_inputs(self): - """This function prepares a list of PIL images.""" - - image_input = np.random.randint(255, size=(3, 30, 400), dtype=np.uint8) - - image_input = Image.fromarray(np.moveaxis(image_input, 0, -1)) - - return image_input - def test_save_load_pretrained_default(self): tokenizer = self.get_tokenizer() image_processor = self.get_image_processor() diff --git a/tests/models/tvp/test_processor_tvp.py b/tests/models/tvp/test_processor_tvp.py new file mode 100644 index 00000000000000..8f5e0bd6b5d05d --- /dev/null +++ b/tests/models/tvp/test_processor_tvp.py @@ -0,0 +1,81 @@ +import inspect +import tempfile +import unittest + +import numpy as np + +from transformers import TvpProcessor +from transformers.testing_utils import require_torch, require_vision + +from ...test_processing_common import ProcessorTesterMixin + + +class TvpProcessorTest(ProcessorTesterMixin, unittest.TestCase): + from_pretrained_id = "Jiqing/tiny-random-tvp" + processor_class = TvpProcessor + videos_data_arg_name = "pixel_values" + + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + processor = self.processor_class.from_pretrained(self.from_pretrained_id) + processor.save_pretrained(self.tmpdirname) + + @require_vision + def prepare_video_inputs(self): + return [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] + + @require_torch + @require_vision + def test_video_processor_defaults_preserved_by_kwargs(self): + if "video_processor" not in self.processor_class.attributes and ( + "videos" not in inspect.signature(self.processor_class.__call__).parameters + or inspect.signature(self.processor_class.__call__).parameters["videos"].annotation == inspect._empty + ): + self.skipTest(f"video_processor attribute not present in {self.processor_class}") + processor_components = self.prepare_components() + processor_components["image_processor"] = self.get_component( + "image_processor", size=(234, 234), crop_size=(234, 234), do_pad=False + ) + if "video_processor" in self.processor_class.attributes: + processor_components["video_processor"] = self.get_component( + "video_processor", size=(234, 234), crop_size=(234, 234), do_pad=False + ) + processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") + processor = self.processor_class(**processor_components) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + video_input = self.prepare_video_inputs() + + inputs = processor(text=input_str, videos=video_input, return_tensors="pt") + self.assertEqual(inputs[self.videos_data_arg_name].shape[-1], 234) + + @require_torch + @require_vision + def 
test_image_processor_defaults_preserved_by_image_kwargs(self): + self.skipTest("TVP does not process images") + + @require_torch + @require_vision + def test_kwargs_overrides_default_image_processor_kwargs(self): + self.skipTest("TVP does not process images") + + @require_torch + @require_vision + def test_unstructured_kwargs(self): + self.skipTest("TVP does not process images") + + @require_torch + @require_vision + def test_unstructured_kwargs_batched(self): + self.skipTest("TVP does not process images") + + @require_torch + @require_vision + def test_structured_kwargs_nested(self): + self.skipTest("TVP does not process images") + + @require_torch + @require_vision + def test_structured_kwargs_nested_from_dict(self): + self.skipTest("TVP does not process images") diff --git a/tests/models/x_clip/test_processor_x_clip.py b/tests/models/x_clip/test_processor_x_clip.py new file mode 100644 index 00000000000000..e9d0bf4b2539ee --- /dev/null +++ b/tests/models/x_clip/test_processor_x_clip.py @@ -0,0 +1,54 @@ +import tempfile +import unittest + +import numpy as np + +from transformers import XCLIPProcessor +from transformers.testing_utils import require_torch, require_vision + +from ...test_processing_common import ProcessorTesterMixin + + +class XCLIPProcessorTest(ProcessorTesterMixin, unittest.TestCase): + from_pretrained_id = "microsoft/xclip-base-patch32" + processor_class = XCLIPProcessor + videos_data_arg_name = "pixel_values" + + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + processor = self.processor_class.from_pretrained(self.from_pretrained_id) + processor.save_pretrained(self.tmpdirname) + + @require_vision + def prepare_video_inputs(self): + return [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] + + @require_torch + @require_vision + def test_image_processor_defaults_preserved_by_image_kwargs(self): + self.skipTest("XCLIP does not process images") + + @require_torch + @require_vision + def test_kwargs_overrides_default_image_processor_kwargs(self): + self.skipTest("XCLIP does not process images") + + @require_torch + @require_vision + def test_unstructured_kwargs(self): + self.skipTest("XCLIP does not process images") + + @require_torch + @require_vision + def test_unstructured_kwargs_batched(self): + self.skipTest("XCLIP does not process images") + + @require_torch + @require_vision + def test_structured_kwargs_nested(self): + self.skipTest("XCLIP does not process images") + + @require_torch + @require_vision + def test_structured_kwargs_nested_from_dict(self): + self.skipTest("XCLIP does not process images") From 5fd2c32673d2faf69f70e0412fbbd4e0996335c8 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Mon, 19 Aug 2024 21:20:19 +0800 Subject: [PATCH 15/25] fix docs --- src/transformers/models/flava/processing_flava.py | 8 ++++---- .../instructblipvideo/processing_instructblipvideo.py | 9 +++++---- src/transformers/models/vilt/processing_vilt.py | 10 +++++----- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/flava/processing_flava.py b/src/transformers/models/flava/processing_flava.py index d1da077a185043..f40e76698032c6 100644 --- a/src/transformers/models/flava/processing_flava.py +++ b/src/transformers/models/flava/processing_flava.py @@ -102,14 +102,14 @@ def __call__( [`BertTokenizerFast.__call__`] to prepare text for the model. Args: - text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`, *optional*): - The sequence or batch of sequences to be encoded. 
Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). images (`ImageInput`, *optional*): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a number of channels, H and W are image height and width. + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`, *optional*): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: diff --git a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py index 2a60f7ab3ef07a..6e6ee8eb865aba 100644 --- a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py @@ -100,14 +100,15 @@ def __call__( [`BertTokenizerFast.__call__`] to prepare text for the model. Args: - text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). images (`ImageInput`, *optional*): + NOTE: Use `videos` instead. We only left this here for backwards compatibility. The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a number of channels, H and W are image height and width. + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`, *optional*): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). videos (`VideoInput`, *optional*): The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported. diff --git a/src/transformers/models/vilt/processing_vilt.py b/src/transformers/models/vilt/processing_vilt.py index be8729e297a875..c5d22502096d31 100644 --- a/src/transformers/models/vilt/processing_vilt.py +++ b/src/transformers/models/vilt/processing_vilt.py @@ -97,14 +97,14 @@ def __call__( [`BertTokenizerFast.__call__`] to prepare text for the model. Args: - text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). 
If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - images (`ImageInput`, *optional*): + images (`ImageInput`): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a number of channels, H and W are image height and width. + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`, *optional*): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: From 9e00f6875e8b90c45434b4a073c6cf78b0a3ffac Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Tue, 20 Aug 2024 18:21:17 +0800 Subject: [PATCH 16/25] address @zucchini-nlp's comments --- .../models/mgp_str/processing_mgp_str.py | 25 ++++++++++++------- .../models/tvp/image_processing_tvp.py | 24 ++++++------------ .../videomae/image_processing_videomae.py | 9 ++++++- .../models/vivit/image_processing_vivit.py | 8 +++++- tests/models/tvp/test_processor_tvp.py | 6 ----- tests/models/x_clip/test_processor_x_clip.py | 6 ----- tests/test_processing_common.py | 2 +- 7 files changed, 39 insertions(+), 41 deletions(-) diff --git a/src/transformers/models/mgp_str/processing_mgp_str.py b/src/transformers/models/mgp_str/processing_mgp_str.py index 7e30a0336b809f..169d8adcec7b8a 100644 --- a/src/transformers/models/mgp_str/processing_mgp_str.py +++ b/src/transformers/models/mgp_str/processing_mgp_str.py @@ -78,28 +78,35 @@ def __init__(self, image_processor=None, tokenizer=None, **kwargs): FutureWarning, ) feature_extractor = kwargs.pop("feature_extractor") - if "char_tokenizer" in kwargs: - warnings.warn( - "The `char_tokenizer` argument is deprecated and will be removed in future versions, use `tokenizer`" - " instead.", - FutureWarning, - ) - char_tokenizer = kwargs.pop("char_tokenizer") image_processor = image_processor if image_processor is not None else feature_extractor - tokenizer = tokenizer if tokenizer is not None else char_tokenizer if image_processor is None: raise ValueError("You need to specify an `image_processor`.") if tokenizer is None: raise ValueError("You need to specify a `tokenizer`.") self.tokenizer = tokenizer - self.char_tokenizer = tokenizer # For backwards compatibility self.bpe_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2") self.wp_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") super().__init__(image_processor, tokenizer) + @property + def char_tokenizer(self): + warnings.warn( + "The `char_tokenizer` attribute is deprecated and will be removed in future versions, use `tokenizer` instead.", + FutureWarning, + ) + return self.tokenizer + + @char_tokenizer.setter + def char_tokenizer(self, value): + warnings.warn( + "The `char_tokenizer` attribute is deprecated and will be removed in future versions, use `tokenizer` instead.", + FutureWarning, + ) + self.tokenizer = value + def __call__( self, text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, diff --git a/src/transformers/models/tvp/image_processing_tvp.py 
b/src/transformers/models/tvp/image_processing_tvp.py index 4e9618eef17084..7a4c5db004671e 100644 --- a/src/transformers/models/tvp/image_processing_tvp.py +++ b/src/transformers/models/tvp/image_processing_tvp.py @@ -50,7 +50,13 @@ # Copied from transformers.models.vivit.image_processing_vivit.make_batched def make_batched(videos) -> List[List[ImageInput]]: - if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): + if isinstance(videos, np.ndarray) and videos.ndim == 5: + return videos + + elif isinstance(videos, np.ndarray) and videos.ndim == 4: + return [videos] + + elif isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): @@ -305,30 +311,20 @@ def _preprocess_image( # All transformations expect numpy arrays. image = to_numpy_array(image) - print(f"{image.shape = }") - if do_resize: image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) - print(f"{image.shape = }") - if do_center_crop: image = self.center_crop(image, size=crop_size, input_data_format=input_data_format) - print(f"{image.shape = }") - if do_rescale: image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) - print(f"{image.shape = }") - if do_normalize: image = self.normalize( image=image.astype(np.float32), mean=image_mean, std=image_std, input_data_format=input_data_format ) - print(f"{image.shape = }") - if do_pad: image = self.pad_image( image=image, @@ -338,18 +334,12 @@ def _preprocess_image( input_data_format=input_data_format, ) - print(f"{image.shape = }") - # the pretrained checkpoints assume images are BGR, not RGB if do_flip_channel_order: image = flip_channel_order(image=image, input_data_format=input_data_format) - print(f"{image.shape = }") - image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) - print(f"{image.shape = }") - return image @filter_out_non_signature_kwargs() diff --git a/src/transformers/models/videomae/image_processing_videomae.py b/src/transformers/models/videomae/image_processing_videomae.py index 413589523aa675..c21210faf6670c 100644 --- a/src/transformers/models/videomae/image_processing_videomae.py +++ b/src/transformers/models/videomae/image_processing_videomae.py @@ -47,8 +47,15 @@ logger = logging.get_logger(__name__) +# Copied from transformers.models.vivit.image_processing_vivit.make_batched def make_batched(videos) -> List[List[ImageInput]]: - if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): + if isinstance(videos, np.ndarray) and videos.ndim == 5: + return videos + + elif isinstance(videos, np.ndarray) and videos.ndim == 4: + return [videos] + + elif isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): diff --git a/src/transformers/models/vivit/image_processing_vivit.py b/src/transformers/models/vivit/image_processing_vivit.py index 5f251bbd1b95b9..fb959e9f1eddb2 100644 --- a/src/transformers/models/vivit/image_processing_vivit.py +++ b/src/transformers/models/vivit/image_processing_vivit.py @@ -51,7 +51,13 @@ def make_batched(videos) -> List[List[ImageInput]]: - if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and 
is_valid_image(videos[0][0]): + if isinstance(videos, np.ndarray) and videos.ndim == 5: + return videos + + elif isinstance(videos, np.ndarray) and videos.ndim == 4: + return [videos] + + elif isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): diff --git a/tests/models/tvp/test_processor_tvp.py b/tests/models/tvp/test_processor_tvp.py index 8f5e0bd6b5d05d..40d700e0beea15 100644 --- a/tests/models/tvp/test_processor_tvp.py +++ b/tests/models/tvp/test_processor_tvp.py @@ -2,8 +2,6 @@ import tempfile import unittest -import numpy as np - from transformers import TvpProcessor from transformers.testing_utils import require_torch, require_vision @@ -20,10 +18,6 @@ def setUp(self): processor = self.processor_class.from_pretrained(self.from_pretrained_id) processor.save_pretrained(self.tmpdirname) - @require_vision - def prepare_video_inputs(self): - return [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] - @require_torch @require_vision def test_video_processor_defaults_preserved_by_kwargs(self): diff --git a/tests/models/x_clip/test_processor_x_clip.py b/tests/models/x_clip/test_processor_x_clip.py index e9d0bf4b2539ee..5b34855a67252a 100644 --- a/tests/models/x_clip/test_processor_x_clip.py +++ b/tests/models/x_clip/test_processor_x_clip.py @@ -1,8 +1,6 @@ import tempfile import unittest -import numpy as np - from transformers import XCLIPProcessor from transformers.testing_utils import require_torch, require_vision @@ -19,10 +17,6 @@ def setUp(self): processor = self.processor_class.from_pretrained(self.from_pretrained_id) processor.save_pretrained(self.tmpdirname) - @require_vision - def prepare_video_inputs(self): - return [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] - @require_torch @require_vision def test_image_processor_defaults_preserved_by_image_kwargs(self): diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index ebb5e6f74f3d07..53cfcf5520c053 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -93,7 +93,7 @@ def prepare_image_inputs(self): @require_vision def prepare_video_inputs(self): - return [np.random.randint(255, size=(4, 3, 30, 400), dtype=np.uint8)] + return np.random.randint(255, size=(1, 4, 3, 30, 400), dtype=np.uint8) def test_processor_to_json_string(self): processor = self.get_processor() From a2672a6dc10f7b5d2ebed3e4733b0116763f2a62 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Tue, 20 Aug 2024 18:32:25 +0800 Subject: [PATCH 17/25] simplify implementations --- .../models/altclip/processing_altclip.py | 15 +++++-------- .../models/flava/processing_flava.py | 14 +++++-------- src/transformers/models/git/processing_git.py | 21 +++++++------------ .../models/mgp_str/processing_mgp_str.py | 17 +++++++-------- .../models/siglip/processing_siglip.py | 17 +++++---------- src/transformers/models/tvp/processing_tvp.py | 14 ++++++------- .../models/vilt/processing_vilt.py | 14 +++++++------ .../models/x_clip/processing_x_clip.py | 17 ++++++--------- 8 files changed, 50 insertions(+), 79 deletions(-) diff --git a/src/transformers/models/altclip/processing_altclip.py b/src/transformers/models/altclip/processing_altclip.py index 9d01f96afac5e3..51ea3032053c3d 100644 --- a/src/transformers/models/altclip/processing_altclip.py +++ b/src/transformers/models/altclip/processing_altclip.py @@ -116,19 +116,14 @@ def __call__( **kwargs, ) + data = {} if 
text is not None: - encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) - + text_features = self.tokenizer(text, **output_kwargs["text_kwargs"]) + data.update(text_features) if images is not None: image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) - - return_tensors = output_kwargs["common_kwargs"].get("return_tensors") - if text is not None and images is not None: - return BatchFeature(data=dict(**encoding, **image_features), tensor_type=return_tensors) - elif text is not None: - return BatchFeature(data=dict(**encoding), tensor_type=return_tensors) - else: - return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) + data.update(image_features) + return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/flava/processing_flava.py b/src/transformers/models/flava/processing_flava.py index f40e76698032c6..ace0434d0bd2f7 100644 --- a/src/transformers/models/flava/processing_flava.py +++ b/src/transformers/models/flava/processing_flava.py @@ -130,18 +130,14 @@ def __call__( **kwargs, ) + data = {} if text is not None: - encoding = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) + text_features = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) + data.update(text_features) if images is not None: image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) - - return_tensors = output_kwargs["common_kwargs"].get("return_tensors") - if text is not None and images is not None: - return BatchFeature(data=dict(**encoding, **image_features), tensor_type=return_tensors) - elif text is not None: - return BatchFeature(data=dict(**encoding), tensor_type=return_tensors) - else: - return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) + data.update(image_features) + return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/git/processing_git.py b/src/transformers/models/git/processing_git.py index 3d1166e3b06d97..5abb1990233ac9 100644 --- a/src/transformers/models/git/processing_git.py +++ b/src/transformers/models/git/processing_git.py @@ -91,28 +91,23 @@ def __call__( - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ + if text is None and images is None: + raise ValueError("You have to specify either text or images. Both cannot be none.") + output_kwargs = self._merge_kwargs( GitProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, ) - if text is None and images is None: - raise ValueError("You have to specify either text or images. 
Both cannot be none.") - + data = {} if text is not None: - encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) - + text_features = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) + data.update(text_features) if images is not None: image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) - - return_tensors = output_kwargs["common_kwargs"].get("return_tensors") - if text is not None and images is not None: - return BatchFeature(data=dict(**encoding, **image_features), tensor_type=return_tensors) - elif text is not None: - return BatchFeature(data=dict(**encoding), tensor_type=return_tensors) - else: - return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) + data.update(image_features) + return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/mgp_str/processing_mgp_str.py b/src/transformers/models/mgp_str/processing_mgp_str.py index 169d8adcec7b8a..11ff8653779301 100644 --- a/src/transformers/models/mgp_str/processing_mgp_str.py +++ b/src/transformers/models/mgp_str/processing_mgp_str.py @@ -139,6 +139,7 @@ def __call__( `None`). - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ + if images is None and text is None: raise ValueError("You need to specify either an `images` or `text` input to process.") @@ -148,18 +149,14 @@ def __call__( **kwargs, ) + data = {} + if text is not None: + text_features = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) + data.update(text_features) if images is not None: image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) - if text is not None: - encodings = self.tokenizer(text, **output_kwargs["text_kwargs"]) - - return_tensors = output_kwargs["common_kwargs"].get("return_tensors") - if text is not None and images is not None: - return BatchFeature(data=dict(**image_features, labels=encodings["input_ids"]), tensor_type=return_tensors) - elif text is not None: - return BatchFeature(data=dict(**encodings), tensor_type=return_tensors) - else: - return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) + data.update(image_features) + return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) def batch_decode(self, sequences): """ diff --git a/src/transformers/models/siglip/processing_siglip.py b/src/transformers/models/siglip/processing_siglip.py index ac3f659d3efaf9..f8f3e8f9eaff49 100644 --- a/src/transformers/models/siglip/processing_siglip.py +++ b/src/transformers/models/siglip/processing_siglip.py @@ -106,21 +106,14 @@ def __call__( **kwargs, ) + data = {} if text is not None: - encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) - + text_features = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) + data.update(text_features) if images is not None: image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) - - return_tensors = output_kwargs["common_kwargs"].get("return_tensors") - if text is not None and images is not None: - return BatchFeature( - data=dict(**encoding, pixel_values=image_features.pixel_values), tensor_type=return_tensors - ) - elif text is not None: - return BatchFeature(data=dict(**encoding), tensor_type=return_tensors) - else: - return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) + data.update(image_features) + return BatchFeature(data=data, 
tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) def decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/tvp/processing_tvp.py b/src/transformers/models/tvp/processing_tvp.py index 96a85a984d8f84..7ce29d9e9e1a53 100644 --- a/src/transformers/models/tvp/processing_tvp.py +++ b/src/transformers/models/tvp/processing_tvp.py @@ -121,16 +121,14 @@ def __call__( **kwargs, ) - encoding = {} + data = {} if text is not None: - textual_input = self.tokenizer(text, **output_kwargs["text_kwargs"]) - encoding.update(textual_input) - + text_features = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) + data.update(text_features) if videos is not None: - image_features = self.image_processor(videos, **output_kwargs["videos_kwargs"]) - encoding.update(image_features) - - return BatchFeature(data=encoding, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) + video_features = self.image_processor(videos, **output_kwargs["videos_kwargs"]) + data.update(video_features) + return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/vilt/processing_vilt.py b/src/transformers/models/vilt/processing_vilt.py index c5d22502096d31..562e5a3f94a955 100644 --- a/src/transformers/models/vilt/processing_vilt.py +++ b/src/transformers/models/vilt/processing_vilt.py @@ -122,12 +122,14 @@ def __call__( **kwargs, ) - encoding = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) - # add pixel_values + pixel_mask - encoding_image_processor = self.image_processor(images, **output_kwargs["images_kwargs"]) - encoding.update(encoding_image_processor) - - return BatchFeature(data=dict(**encoding), tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) + data = {} + if text is not None: + text_features = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) + data.update(text_features) + if images is not None: + images_features = self.image_processor(images, **output_kwargs["images_kwargs"]) + data.update(images_features) + return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/x_clip/processing_x_clip.py b/src/transformers/models/x_clip/processing_x_clip.py index fa2cc860fd5d05..f722ef37d498a8 100644 --- a/src/transformers/models/x_clip/processing_x_clip.py +++ b/src/transformers/models/x_clip/processing_x_clip.py @@ -118,19 +118,14 @@ def __call__( **kwargs, ) + data = {} if text is not None: - encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) - + text_features = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) + data.update(text_features) if videos is not None: - image_features = self.image_processor(videos, **output_kwargs["videos_kwargs"]) - - return_tensors = output_kwargs["common_kwargs"].get("return_tensors") - if text is not None and videos is not None: - return BatchFeature(data=dict(**encoding, **image_features), tensor_type=return_tensors) - elif text is not None: - return BatchFeature(data=dict(**encoding), tensor_type=return_tensors) - else: - return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) + video_features = self.image_processor(videos, **output_kwargs["images_kwargs"]) + data.update(video_features) + return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) def batch_decode(self, *args, **kwargs): """ From 
721d1c81944564d1c30b83bd286ea72721a29523 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Tue, 20 Aug 2024 18:52:01 +0800 Subject: [PATCH 18/25] uniformize implementations of make_batched_videos and make_batched_images --- .../chameleon/image_processing_chameleon.py | 3 ++- .../image_processing_instructblipvideo.py | 21 ++++++++++++------- .../llava_next/image_processing_llava_next.py | 2 +- .../image_processing_llava_next_video.py | 21 ++++++++++++------- .../models/tvp/image_processing_tvp.py | 14 ++++++++----- .../image_processing_video_llava.py | 21 ++++++++++++------- .../videomae/image_processing_videomae.py | 14 ++++++++----- .../models/vivit/image_processing_vivit.py | 12 +++++++---- 8 files changed, 71 insertions(+), 37 deletions(-) diff --git a/src/transformers/models/chameleon/image_processing_chameleon.py b/src/transformers/models/chameleon/image_processing_chameleon.py index a23fdbed028867..2b0bd0024f3be1 100644 --- a/src/transformers/models/chameleon/image_processing_chameleon.py +++ b/src/transformers/models/chameleon/image_processing_chameleon.py @@ -44,7 +44,8 @@ import PIL -def make_batched_images(images) -> List[List[ImageInput]]: +# Copied from transformers.models.llava_next.image_processing_llava_next.make_batched_images +def make_batched_images(images) -> List[ImageInput]: """ Accepts images in list or nested list format, and makes a list of images for preprocessing. diff --git a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py index 131b8fe57bd665..cf9074fe1bbecd 100644 --- a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py @@ -47,18 +47,25 @@ logger = logging.get_logger(__name__) +# Copied from transformers.models.vivit.image_processing_vivit.make_batched_videos def make_batched_videos(videos) -> List[VideoInput]: - if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): + if isinstance(videos, np.ndarray) and videos.ndim == 5: return videos + elif isinstance(videos, np.ndarray) and videos.ndim == 4: + return [videos] + + elif isinstance(videos, (list, tuple)): + if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): + return videos + if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: + return videos + elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - if isinstance(videos[0], PIL.Image.Image): - return [videos] - elif len(videos[0].shape) == 4: - return [list(video) for video in videos] + return [videos] - elif is_valid_image(videos) and len(videos.shape) == 4: - return [list(videos)] + elif is_valid_image(videos): + return [[videos]] raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/llava_next/image_processing_llava_next.py b/src/transformers/models/llava_next/image_processing_llava_next.py index f8237d0078bf0a..c5a0eaa63739c2 100644 --- a/src/transformers/models/llava_next/image_processing_llava_next.py +++ b/src/transformers/models/llava_next/image_processing_llava_next.py @@ -53,7 +53,7 @@ from PIL import Image -def make_batched_images(images) -> List[List[ImageInput]]: +def make_batched_images(images) -> List[ImageInput]: """ Accepts images in list or nested list format, and makes a list of images for preprocessing. 
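The make_batched_videos copies introduced across these image processors all implement the same normalization. As a standalone reference, a minimal sketch of that logic is given below; the _sketch suffix and the flat-list-of-frames fallback are assumptions for illustration, so this is not the library helper itself.

from typing import List

import numpy as np


def make_batched_videos_sketch(videos) -> List:
    # 5D array: already a batch of videos.
    if isinstance(videos, np.ndarray) and videos.ndim == 5:
        return list(videos)
    # 4D array: a single video, wrap it into a one-element batch.
    if isinstance(videos, np.ndarray) and videos.ndim == 4:
        return [videos]
    if isinstance(videos, (list, tuple)) and len(videos) > 0:
        # A list of frame lists, or a list of 4D arrays: already batched.
        if isinstance(videos[0], (list, tuple)) or (
            isinstance(videos[0], np.ndarray) and videos[0].ndim == 4
        ):
            return list(videos)
        # A flat list of frames (e.g. 3D arrays): treat it as one video.
        return [list(videos)]
    raise ValueError(f"Could not make batched video from {videos}")


# A single 8-frame RGB clip given as one 4D array becomes a batch of one video.
clip = np.random.randint(0, 255, size=(8, 3, 224, 224), dtype=np.uint8)
assert len(make_batched_videos_sketch(clip)) == 1
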
diff --git a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py index e16e71875bb2c8..9991d3eb6d1afd 100644 --- a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py @@ -49,18 +49,25 @@ from PIL import Image +# Copied from transformers.models.vivit.image_processing_vivit.make_batched_videos def make_batched_videos(videos) -> List[VideoInput]: - if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): + if isinstance(videos, np.ndarray) and videos.ndim == 5: return videos + elif isinstance(videos, np.ndarray) and videos.ndim == 4: + return [videos] + + elif isinstance(videos, (list, tuple)): + if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): + return videos + if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: + return videos + elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - if isinstance(videos[0], Image.Image): - return [videos] - elif len(videos[0].shape) == 4: - return [list(video) for video in videos] + return [videos] - elif is_valid_image(videos) and len(videos.shape) == 4: - return [list(videos)] + elif is_valid_image(videos): + return [[videos]] raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/tvp/image_processing_tvp.py b/src/transformers/models/tvp/image_processing_tvp.py index 7a4c5db004671e..07a657e1afb229 100644 --- a/src/transformers/models/tvp/image_processing_tvp.py +++ b/src/transformers/models/tvp/image_processing_tvp.py @@ -32,6 +32,7 @@ ChannelDimension, ImageInput, PILImageResampling, + VideoInput, get_image_size, is_valid_image, to_numpy_array, @@ -48,16 +49,19 @@ logger = logging.get_logger(__name__) -# Copied from transformers.models.vivit.image_processing_vivit.make_batched -def make_batched(videos) -> List[List[ImageInput]]: +# Copied from transformers.models.vivit.image_processing_vivit.make_batched_videos +def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos, np.ndarray) and videos.ndim == 5: return videos elif isinstance(videos, np.ndarray) and videos.ndim == 4: return [videos] - elif isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos + elif isinstance(videos, (list, tuple)): + if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): + return videos + if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: + return videos elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): return [videos] @@ -449,7 +453,7 @@ def preprocess( "torch.Tensor, tf.Tensor or jax.ndarray." 
) - videos = make_batched(videos) + videos = make_batched_videos(videos) videos = [ np.array( diff --git a/src/transformers/models/video_llava/image_processing_video_llava.py b/src/transformers/models/video_llava/image_processing_video_llava.py index 3e77110c7d45a8..412c0499bc36e5 100644 --- a/src/transformers/models/video_llava/image_processing_video_llava.py +++ b/src/transformers/models/video_llava/image_processing_video_llava.py @@ -50,18 +50,25 @@ import PIL +# Copied from transformers.models.vivit.image_processing_vivit.make_batched_videos def make_batched_videos(videos) -> List[VideoInput]: - if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): + if isinstance(videos, np.ndarray) and videos.ndim == 5: return videos + elif isinstance(videos, np.ndarray) and videos.ndim == 4: + return [videos] + + elif isinstance(videos, (list, tuple)): + if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): + return videos + if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: + return videos + elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - if isinstance(videos[0], PIL.Image.Image): - return [videos] - elif len(videos[0].shape) == 4: - return [list(video) for video in videos] + return [videos] - elif is_valid_image(videos) and len(videos.shape) == 4: - return [list(videos)] + elif is_valid_image(videos): + return [[videos]] raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/videomae/image_processing_videomae.py b/src/transformers/models/videomae/image_processing_videomae.py index c21210faf6670c..3914fd867dfb65 100644 --- a/src/transformers/models/videomae/image_processing_videomae.py +++ b/src/transformers/models/videomae/image_processing_videomae.py @@ -30,6 +30,7 @@ ChannelDimension, ImageInput, PILImageResampling, + VideoInput, infer_channel_dimension_format, is_scaled_image, is_valid_image, @@ -47,16 +48,19 @@ logger = logging.get_logger(__name__) -# Copied from transformers.models.vivit.image_processing_vivit.make_batched -def make_batched(videos) -> List[List[ImageInput]]: +# Copied from transformers.models.vivit.image_processing_vivit.make_batched_videos +def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos, np.ndarray) and videos.ndim == 5: return videos elif isinstance(videos, np.ndarray) and videos.ndim == 4: return [videos] - elif isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos + elif isinstance(videos, (list, tuple)): + if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): + return videos + if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: + return videos elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): return [videos] @@ -324,7 +328,7 @@ def preprocess( "torch.Tensor, tf.Tensor or jax.ndarray." 
) - videos = make_batched(videos) + videos = make_batched_videos(videos) videos = [ [ diff --git a/src/transformers/models/vivit/image_processing_vivit.py b/src/transformers/models/vivit/image_processing_vivit.py index fb959e9f1eddb2..171805e52229c7 100644 --- a/src/transformers/models/vivit/image_processing_vivit.py +++ b/src/transformers/models/vivit/image_processing_vivit.py @@ -34,6 +34,7 @@ ChannelDimension, ImageInput, PILImageResampling, + VideoInput, infer_channel_dimension_format, is_scaled_image, is_valid_image, @@ -50,15 +51,18 @@ logger = logging.get_logger(__name__) -def make_batched(videos) -> List[List[ImageInput]]: +def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos, np.ndarray) and videos.ndim == 5: return videos elif isinstance(videos, np.ndarray) and videos.ndim == 4: return [videos] - elif isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos + elif isinstance(videos, (list, tuple)): + if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): + return videos + if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: + return videos elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): return [videos] @@ -381,7 +385,7 @@ def preprocess( "torch.Tensor, tf.Tensor or jax.ndarray." ) - videos = make_batched(videos) + videos = make_batched_videos(videos) videos = [ [ From c0f3abb60760ee2934f5bfe33b6a05ab7e96290b Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Tue, 20 Aug 2024 19:07:26 +0800 Subject: [PATCH 19/25] fix instructblipvideo tests --- .../instructblipvideo/image_processing_instructblipvideo.py | 5 ++--- .../llava_next_video/image_processing_llava_next_video.py | 5 ++--- src/transformers/models/tvp/image_processing_tvp.py | 5 ++--- .../models/video_llava/image_processing_video_llava.py | 5 ++--- .../models/videomae/image_processing_videomae.py | 5 ++--- 5 files changed, 10 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py index cf9074fe1bbecd..827d1bfc0bbb8d 100644 --- a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py @@ -60,9 +60,8 @@ def make_batched_videos(videos) -> List[VideoInput]: return videos if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: return videos - - elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - return [videos] + if is_valid_image(videos[0]): + return [videos] elif is_valid_image(videos): return [[videos]] diff --git a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py index 9991d3eb6d1afd..3a53f222f5f226 100644 --- a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py @@ -62,9 +62,8 @@ def make_batched_videos(videos) -> List[VideoInput]: return videos if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: return videos - - elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - return [videos] + if is_valid_image(videos[0]): + return [videos] elif is_valid_image(videos): return [[videos]] diff --git a/src/transformers/models/tvp/image_processing_tvp.py 
b/src/transformers/models/tvp/image_processing_tvp.py index 07a657e1afb229..96bdf9855f6666 100644 --- a/src/transformers/models/tvp/image_processing_tvp.py +++ b/src/transformers/models/tvp/image_processing_tvp.py @@ -62,9 +62,8 @@ def make_batched_videos(videos) -> List[VideoInput]: return videos if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: return videos - - elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - return [videos] + if is_valid_image(videos[0]): + return [videos] elif is_valid_image(videos): return [[videos]] diff --git a/src/transformers/models/video_llava/image_processing_video_llava.py b/src/transformers/models/video_llava/image_processing_video_llava.py index 412c0499bc36e5..04d24be50eb38c 100644 --- a/src/transformers/models/video_llava/image_processing_video_llava.py +++ b/src/transformers/models/video_llava/image_processing_video_llava.py @@ -63,9 +63,8 @@ def make_batched_videos(videos) -> List[VideoInput]: return videos if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: return videos - - elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - return [videos] + if is_valid_image(videos[0]): + return [videos] elif is_valid_image(videos): return [[videos]] diff --git a/src/transformers/models/videomae/image_processing_videomae.py b/src/transformers/models/videomae/image_processing_videomae.py index 3914fd867dfb65..12c86bdfd6e7f7 100644 --- a/src/transformers/models/videomae/image_processing_videomae.py +++ b/src/transformers/models/videomae/image_processing_videomae.py @@ -61,9 +61,8 @@ def make_batched_videos(videos) -> List[VideoInput]: return videos if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: return videos - - elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - return [videos] + if is_valid_image(videos[0]): + return [videos] elif is_valid_image(videos): return [[videos]] From bb5debdf49e555143d3fc1ca55db8288f45fc5c2 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Tue, 20 Aug 2024 19:12:22 +0800 Subject: [PATCH 20/25] fix copies --- .../image_processing_instructblipvideo.py | 4 ++-- .../llava_next_video/image_processing_llava_next_video.py | 4 ++-- src/transformers/models/tvp/image_processing_tvp.py | 4 ++-- .../models/video_llava/image_processing_video_llava.py | 4 ++-- .../models/videomae/image_processing_videomae.py | 4 ++-- src/transformers/models/vivit/image_processing_vivit.py | 7 +++---- 6 files changed, 13 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py index 827d1bfc0bbb8d..506e2c8ef9521f 100644 --- a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py @@ -58,9 +58,9 @@ def make_batched_videos(videos) -> List[VideoInput]: elif isinstance(videos, (list, tuple)): if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos - if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: + elif isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: return videos - if is_valid_image(videos[0]): + elif is_valid_image(videos[0]): return [videos] elif is_valid_image(videos): diff --git a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py index 
3a53f222f5f226..1efd9d6e6af3e3 100644 --- a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py @@ -60,9 +60,9 @@ def make_batched_videos(videos) -> List[VideoInput]: elif isinstance(videos, (list, tuple)): if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos - if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: + elif isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: return videos - if is_valid_image(videos[0]): + elif is_valid_image(videos[0]): return [videos] elif is_valid_image(videos): diff --git a/src/transformers/models/tvp/image_processing_tvp.py b/src/transformers/models/tvp/image_processing_tvp.py index 96bdf9855f6666..175b0df07f68fd 100644 --- a/src/transformers/models/tvp/image_processing_tvp.py +++ b/src/transformers/models/tvp/image_processing_tvp.py @@ -60,9 +60,9 @@ def make_batched_videos(videos) -> List[VideoInput]: elif isinstance(videos, (list, tuple)): if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos - if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: + elif isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: return videos - if is_valid_image(videos[0]): + elif is_valid_image(videos[0]): return [videos] elif is_valid_image(videos): diff --git a/src/transformers/models/video_llava/image_processing_video_llava.py b/src/transformers/models/video_llava/image_processing_video_llava.py index 04d24be50eb38c..87a54f79744a41 100644 --- a/src/transformers/models/video_llava/image_processing_video_llava.py +++ b/src/transformers/models/video_llava/image_processing_video_llava.py @@ -61,9 +61,9 @@ def make_batched_videos(videos) -> List[VideoInput]: elif isinstance(videos, (list, tuple)): if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos - if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: + elif isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: return videos - if is_valid_image(videos[0]): + elif is_valid_image(videos[0]): return [videos] elif is_valid_image(videos): diff --git a/src/transformers/models/videomae/image_processing_videomae.py b/src/transformers/models/videomae/image_processing_videomae.py index 12c86bdfd6e7f7..0de895b51fbdea 100644 --- a/src/transformers/models/videomae/image_processing_videomae.py +++ b/src/transformers/models/videomae/image_processing_videomae.py @@ -59,9 +59,9 @@ def make_batched_videos(videos) -> List[VideoInput]: elif isinstance(videos, (list, tuple)): if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos - if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: + elif isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: return videos - if is_valid_image(videos[0]): + elif is_valid_image(videos[0]): return [videos] elif is_valid_image(videos): diff --git a/src/transformers/models/vivit/image_processing_vivit.py b/src/transformers/models/vivit/image_processing_vivit.py index 171805e52229c7..06e70662f270ce 100644 --- a/src/transformers/models/vivit/image_processing_vivit.py +++ b/src/transformers/models/vivit/image_processing_vivit.py @@ -61,11 +61,10 @@ def make_batched_videos(videos) -> List[VideoInput]: elif isinstance(videos, (list, tuple)): if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos - if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: + elif isinstance(videos[0], 
np.ndarray) and videos[0].ndim == 4: return videos - - elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - return [videos] + elif is_valid_image(videos[0]): + return [videos] elif is_valid_image(videos): return [[videos]] From d9bc2e924eb16045a3cb3960c877f0a65000ca6c Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Tue, 20 Aug 2024 20:01:34 +0800 Subject: [PATCH 21/25] fix make_batched_videos --- .../image_processing_instructblipvideo.py | 19 +++++++----------- .../image_processing_llava_next_video.py | 19 +++++++----------- .../models/tvp/image_processing_tvp.py | 19 +++++++----------- .../image_processing_video_llava.py | 20 +++++++------------ .../videomae/image_processing_videomae.py | 19 +++++++----------- .../models/vivit/image_processing_vivit.py | 19 +++++++----------- 6 files changed, 42 insertions(+), 73 deletions(-) diff --git a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py index 506e2c8ef9521f..e8a7a75120160f 100644 --- a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py @@ -49,22 +49,17 @@ # Copied from transformers.models.vivit.image_processing_vivit.make_batched_videos def make_batched_videos(videos) -> List[VideoInput]: - if isinstance(videos, np.ndarray) and videos.ndim == 5: + if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos - elif isinstance(videos, np.ndarray) and videos.ndim == 4: - return [videos] - - elif isinstance(videos, (list, tuple)): - if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos - elif isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: - return videos - elif is_valid_image(videos[0]): + elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): + if isinstance(videos[0], PIL.Image.Image): return [videos] + elif len(videos[0].shape) == 4: + return [list(video) for video in videos] - elif is_valid_image(videos): - return [[videos]] + elif is_valid_image(videos) and len(videos.shape) == 4: + return [list(videos)] raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py index 1efd9d6e6af3e3..3196974855550a 100644 --- a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py @@ -51,22 +51,17 @@ # Copied from transformers.models.vivit.image_processing_vivit.make_batched_videos def make_batched_videos(videos) -> List[VideoInput]: - if isinstance(videos, np.ndarray) and videos.ndim == 5: + if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos - elif isinstance(videos, np.ndarray) and videos.ndim == 4: - return [videos] - - elif isinstance(videos, (list, tuple)): - if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos - elif isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: - return videos - elif is_valid_image(videos[0]): + elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): + if isinstance(videos[0], PIL.Image.Image): return [videos] + elif len(videos[0].shape) 
== 4: + return [list(video) for video in videos] - elif is_valid_image(videos): - return [[videos]] + elif is_valid_image(videos) and len(videos.shape) == 4: + return [list(videos)] raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/tvp/image_processing_tvp.py b/src/transformers/models/tvp/image_processing_tvp.py index 175b0df07f68fd..8b1a737e998ae7 100644 --- a/src/transformers/models/tvp/image_processing_tvp.py +++ b/src/transformers/models/tvp/image_processing_tvp.py @@ -51,22 +51,17 @@ # Copied from transformers.models.vivit.image_processing_vivit.make_batched_videos def make_batched_videos(videos) -> List[VideoInput]: - if isinstance(videos, np.ndarray) and videos.ndim == 5: + if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos - elif isinstance(videos, np.ndarray) and videos.ndim == 4: - return [videos] - - elif isinstance(videos, (list, tuple)): - if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos - elif isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: - return videos - elif is_valid_image(videos[0]): + elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): + if isinstance(videos[0], PIL.Image.Image): return [videos] + elif len(videos[0].shape) == 4: + return [list(video) for video in videos] - elif is_valid_image(videos): - return [[videos]] + elif is_valid_image(videos) and len(videos.shape) == 4: + return [list(videos)] raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/video_llava/image_processing_video_llava.py b/src/transformers/models/video_llava/image_processing_video_llava.py index 87a54f79744a41..85dc131c6a5db2 100644 --- a/src/transformers/models/video_llava/image_processing_video_llava.py +++ b/src/transformers/models/video_llava/image_processing_video_llava.py @@ -34,7 +34,6 @@ VideoInput, infer_channel_dimension_format, is_scaled_image, - is_valid_image, make_list_of_images, to_numpy_array, valid_images, @@ -52,22 +51,17 @@ # Copied from transformers.models.vivit.image_processing_vivit.make_batched_videos def make_batched_videos(videos) -> List[VideoInput]: - if isinstance(videos, np.ndarray) and videos.ndim == 5: + if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos - elif isinstance(videos, np.ndarray) and videos.ndim == 4: - return [videos] - - elif isinstance(videos, (list, tuple)): - if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos - elif isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: - return videos - elif is_valid_image(videos[0]): + elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): + if isinstance(videos[0], PIL.Image.Image): return [videos] + elif len(videos[0].shape) == 4: + return [list(video) for video in videos] - elif is_valid_image(videos): - return [[videos]] + elif is_valid_image(videos) and len(videos.shape) == 4: + return [list(videos)] raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/videomae/image_processing_videomae.py b/src/transformers/models/videomae/image_processing_videomae.py index 0de895b51fbdea..8df92b3516fc0f 100644 --- a/src/transformers/models/videomae/image_processing_videomae.py +++ b/src/transformers/models/videomae/image_processing_videomae.py @@ -50,22 +50,17 @@ # Copied from 
transformers.models.vivit.image_processing_vivit.make_batched_videos def make_batched_videos(videos) -> List[VideoInput]: - if isinstance(videos, np.ndarray) and videos.ndim == 5: + if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos - elif isinstance(videos, np.ndarray) and videos.ndim == 4: - return [videos] - - elif isinstance(videos, (list, tuple)): - if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos - elif isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: - return videos - elif is_valid_image(videos[0]): + elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): + if isinstance(videos[0], PIL.Image.Image): return [videos] + elif len(videos[0].shape) == 4: + return [list(video) for video in videos] - elif is_valid_image(videos): - return [[videos]] + elif is_valid_image(videos) and len(videos.shape) == 4: + return [list(videos)] raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/vivit/image_processing_vivit.py b/src/transformers/models/vivit/image_processing_vivit.py index 06e70662f270ce..0cca6305f2a1cc 100644 --- a/src/transformers/models/vivit/image_processing_vivit.py +++ b/src/transformers/models/vivit/image_processing_vivit.py @@ -52,22 +52,17 @@ def make_batched_videos(videos) -> List[VideoInput]: - if isinstance(videos, np.ndarray) and videos.ndim == 5: + if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos - elif isinstance(videos, np.ndarray) and videos.ndim == 4: - return [videos] - - elif isinstance(videos, (list, tuple)): - if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos - elif isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: - return videos - elif is_valid_image(videos[0]): + elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): + if isinstance(videos[0], PIL.Image.Image): return [videos] + elif len(videos[0].shape) == 4: + return [list(video) for video in videos] - elif is_valid_image(videos): - return [[videos]] + elif is_valid_image(videos) and len(videos.shape) == 4: + return [list(videos)] raise ValueError(f"Could not make batched video from {videos}") From f6e7914aa091befe0e44490f00a4ace1bb899370 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Tue, 20 Aug 2024 20:07:17 +0800 Subject: [PATCH 22/25] fix MGP-str --- .../llava_next_video/image_processing_llava_next_video.py | 4 ++-- src/transformers/models/mgp_str/processing_mgp_str.py | 4 ++++ .../models/video_llava/image_processing_video_llava.py | 1 + 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py index 3196974855550a..705c6adc42a536 100644 --- a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py @@ -46,7 +46,7 @@ if is_vision_available(): - from PIL import Image + import PIL # Copied from transformers.models.vivit.image_processing_vivit.make_batched_videos @@ -213,7 +213,7 @@ def _preprocess( do_convert_rgb: bool = None, data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> Image.Image: + ) -> PIL.Image.Image: """ Preprocess 
an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`. diff --git a/src/transformers/models/mgp_str/processing_mgp_str.py b/src/transformers/models/mgp_str/processing_mgp_str.py index 11ff8653779301..d194017669c460 100644 --- a/src/transformers/models/mgp_str/processing_mgp_str.py +++ b/src/transformers/models/mgp_str/processing_mgp_str.py @@ -156,6 +156,10 @@ def __call__( if images is not None: image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) data.update(image_features) + # TODO: remove this after standardizing the outputs of vision-language processors + if "input_ids" in data: + data["labels"] = data["input_ids"] + data.pop("input_ids") return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) def batch_decode(self, sequences): diff --git a/src/transformers/models/video_llava/image_processing_video_llava.py b/src/transformers/models/video_llava/image_processing_video_llava.py index 85dc131c6a5db2..befc4c017260eb 100644 --- a/src/transformers/models/video_llava/image_processing_video_llava.py +++ b/src/transformers/models/video_llava/image_processing_video_llava.py @@ -34,6 +34,7 @@ VideoInput, infer_channel_dimension_format, is_scaled_image, + is_valid_image, make_list_of_images, to_numpy_array, valid_images, From acd2c562ec270b6c0e50a5f48b3637ebd5d403cd Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Tue, 20 Aug 2024 23:27:33 +0800 Subject: [PATCH 23/25] fix make_batched_videos --- .../image_processing_instructblipvideo.py | 9 ++++++--- .../image_processing_llava_next_video.py | 9 ++++++--- src/transformers/models/tvp/image_processing_tvp.py | 9 ++++++--- .../models/video_llava/image_processing_video_llava.py | 9 ++++++--- .../models/videomae/image_processing_videomae.py | 9 ++++++--- src/transformers/models/vivit/image_processing_vivit.py | 9 ++++++--- 6 files changed, 36 insertions(+), 18 deletions(-) diff --git a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py index e8a7a75120160f..cd163b370ebd20 100644 --- a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py @@ -56,10 +56,13 @@ def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos[0], PIL.Image.Image): return [videos] elif len(videos[0].shape) == 4: - return [list(video) for video in videos] + return [videos] - elif is_valid_image(videos) and len(videos.shape) == 4: - return [list(videos)] + elif is_valid_image(videos): + if len(videos.shape) == 5: + return videos + elif len(videos.shape) == 4: + return [videos] raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py index 705c6adc42a536..4c96486e413435 100644 --- a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py @@ -58,10 +58,13 @@ def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos[0], PIL.Image.Image): return [videos] elif len(videos[0].shape) == 4: - return [list(video) for video in videos] + return [videos] - elif is_valid_image(videos) and len(videos.shape) == 4: - return [list(videos)] + elif 
is_valid_image(videos): + if len(videos.shape) == 5: + return videos + elif len(videos.shape) == 4: + return [videos] raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/tvp/image_processing_tvp.py b/src/transformers/models/tvp/image_processing_tvp.py index 8b1a737e998ae7..d7f80cbaed1c2a 100644 --- a/src/transformers/models/tvp/image_processing_tvp.py +++ b/src/transformers/models/tvp/image_processing_tvp.py @@ -58,10 +58,13 @@ def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos[0], PIL.Image.Image): return [videos] elif len(videos[0].shape) == 4: - return [list(video) for video in videos] + return [videos] - elif is_valid_image(videos) and len(videos.shape) == 4: - return [list(videos)] + elif is_valid_image(videos): + if len(videos.shape) == 5: + return videos + elif len(videos.shape) == 4: + return [videos] raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/video_llava/image_processing_video_llava.py b/src/transformers/models/video_llava/image_processing_video_llava.py index befc4c017260eb..a96704b48503e6 100644 --- a/src/transformers/models/video_llava/image_processing_video_llava.py +++ b/src/transformers/models/video_llava/image_processing_video_llava.py @@ -59,10 +59,13 @@ def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos[0], PIL.Image.Image): return [videos] elif len(videos[0].shape) == 4: - return [list(video) for video in videos] + return [videos] - elif is_valid_image(videos) and len(videos.shape) == 4: - return [list(videos)] + elif is_valid_image(videos): + if len(videos.shape) == 5: + return videos + elif len(videos.shape) == 4: + return [videos] raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/videomae/image_processing_videomae.py b/src/transformers/models/videomae/image_processing_videomae.py index 8df92b3516fc0f..be0ed4baf88ec2 100644 --- a/src/transformers/models/videomae/image_processing_videomae.py +++ b/src/transformers/models/videomae/image_processing_videomae.py @@ -57,10 +57,13 @@ def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos[0], PIL.Image.Image): return [videos] elif len(videos[0].shape) == 4: - return [list(video) for video in videos] + return [videos] - elif is_valid_image(videos) and len(videos.shape) == 4: - return [list(videos)] + elif is_valid_image(videos): + if len(videos.shape) == 5: + return videos + elif len(videos.shape) == 4: + return [videos] raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/vivit/image_processing_vivit.py b/src/transformers/models/vivit/image_processing_vivit.py index 0cca6305f2a1cc..99587a6a27753b 100644 --- a/src/transformers/models/vivit/image_processing_vivit.py +++ b/src/transformers/models/vivit/image_processing_vivit.py @@ -59,10 +59,13 @@ def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos[0], PIL.Image.Image): return [videos] elif len(videos[0].shape) == 4: - return [list(video) for video in videos] + return [videos] - elif is_valid_image(videos) and len(videos.shape) == 4: - return [list(videos)] + elif is_valid_image(videos): + if len(videos.shape) == 5: + return videos + elif len(videos.shape) == 4: + return [videos] raise ValueError(f"Could not make batched video from {videos}") From 5c39f4f19a091c9cd648fd1376e9b374a8fdad95 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Tue, 20 Aug 2024 23:44:17 +0800 Subject: [PATCH 
24/25] fix make_batched_videos --- .../instructblipvideo/image_processing_instructblipvideo.py | 2 ++ .../llava_next_video/image_processing_llava_next_video.py | 2 ++ src/transformers/models/tvp/image_processing_tvp.py | 2 ++ .../models/video_llava/image_processing_video_llava.py | 2 ++ src/transformers/models/videomae/image_processing_videomae.py | 2 ++ src/transformers/models/vivit/image_processing_vivit.py | 2 ++ 6 files changed, 12 insertions(+) diff --git a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py index cd163b370ebd20..a686806a97451b 100644 --- a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py @@ -56,6 +56,8 @@ def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos[0], PIL.Image.Image): return [videos] elif len(videos[0].shape) == 4: + return videos + elif len(videos[0].shape) == 3: return [videos] elif is_valid_image(videos): diff --git a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py index 4c96486e413435..9ca191644302ef 100644 --- a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py @@ -58,6 +58,8 @@ def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos[0], PIL.Image.Image): return [videos] elif len(videos[0].shape) == 4: + return videos + elif len(videos[0].shape) == 3: return [videos] elif is_valid_image(videos): diff --git a/src/transformers/models/tvp/image_processing_tvp.py b/src/transformers/models/tvp/image_processing_tvp.py index d7f80cbaed1c2a..8f05b3e966f1c8 100644 --- a/src/transformers/models/tvp/image_processing_tvp.py +++ b/src/transformers/models/tvp/image_processing_tvp.py @@ -58,6 +58,8 @@ def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos[0], PIL.Image.Image): return [videos] elif len(videos[0].shape) == 4: + return videos + elif len(videos[0].shape) == 3: return [videos] elif is_valid_image(videos): diff --git a/src/transformers/models/video_llava/image_processing_video_llava.py b/src/transformers/models/video_llava/image_processing_video_llava.py index a96704b48503e6..321e36eb50bc12 100644 --- a/src/transformers/models/video_llava/image_processing_video_llava.py +++ b/src/transformers/models/video_llava/image_processing_video_llava.py @@ -59,6 +59,8 @@ def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos[0], PIL.Image.Image): return [videos] elif len(videos[0].shape) == 4: + return videos + elif len(videos[0].shape) == 3: return [videos] elif is_valid_image(videos): diff --git a/src/transformers/models/videomae/image_processing_videomae.py b/src/transformers/models/videomae/image_processing_videomae.py index be0ed4baf88ec2..628c9cb5cd66b9 100644 --- a/src/transformers/models/videomae/image_processing_videomae.py +++ b/src/transformers/models/videomae/image_processing_videomae.py @@ -57,6 +57,8 @@ def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos[0], PIL.Image.Image): return [videos] elif len(videos[0].shape) == 4: + return videos + elif len(videos[0].shape) == 3: return [videos] elif is_valid_image(videos): diff --git a/src/transformers/models/vivit/image_processing_vivit.py 
b/src/transformers/models/vivit/image_processing_vivit.py index 99587a6a27753b..51d646922feba9 100644 --- a/src/transformers/models/vivit/image_processing_vivit.py +++ b/src/transformers/models/vivit/image_processing_vivit.py @@ -59,6 +59,8 @@ def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos[0], PIL.Image.Image): return [videos] elif len(videos[0].shape) == 4: + return videos + elif len(videos[0].shape) == 3: return [videos] elif is_valid_image(videos): From ea06e458c011c1e0b4f343056a9ffe92494b518a Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Wed, 21 Aug 2024 00:22:28 +0800 Subject: [PATCH 25/25] fix make_batched_videos --- .../image_processing_instructblipvideo.py | 13 +++++++------ .../image_processing_llava_next_video.py | 13 +++++++------ .../models/mgp_str/processing_mgp_str.py | 13 +++++++++---- src/transformers/models/tvp/image_processing_tvp.py | 13 +++++++------ .../video_llava/image_processing_video_llava.py | 13 +++++++------ .../models/videomae/image_processing_videomae.py | 13 +++++++------ .../models/vivit/image_processing_vivit.py | 13 +++++++------ 7 files changed, 51 insertions(+), 40 deletions(-) diff --git a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py index a686806a97451b..093aab0c4d2cb6 100644 --- a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py @@ -50,21 +50,22 @@ # Copied from transformers.models.vivit.image_processing_vivit.make_batched_videos def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos + if isinstance(videos[0][0], PIL.Image.Image) or len(videos[0][0].shape) == 3: + return videos elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - if isinstance(videos[0], PIL.Image.Image): + if isinstance(videos[0], PIL.Image.Image) or len(videos[0].shape) == 3: return [videos] elif len(videos[0].shape) == 4: return videos - elif len(videos[0].shape) == 3: - return [videos] elif is_valid_image(videos): - if len(videos.shape) == 5: - return videos + if isinstance(videos, PIL.Image.Image) or len(videos.shape) == 3: + return [[videos]] elif len(videos.shape) == 4: return [videos] + elif len(videos.shape) == 5: + return videos raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py index 9ca191644302ef..1000d8de635699 100644 --- a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py @@ -52,21 +52,22 @@ # Copied from transformers.models.vivit.image_processing_vivit.make_batched_videos def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos + if isinstance(videos[0][0], PIL.Image.Image) or len(videos[0][0].shape) == 3: + return videos elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - if isinstance(videos[0], PIL.Image.Image): + if isinstance(videos[0], PIL.Image.Image) or len(videos[0].shape) == 3: return [videos] elif len(videos[0].shape) 
== 4: return videos - elif len(videos[0].shape) == 3: - return [videos] elif is_valid_image(videos): - if len(videos.shape) == 5: - return videos + if isinstance(videos, PIL.Image.Image) or len(videos.shape) == 3: + return [[videos]] elif len(videos.shape) == 4: return [videos] + elif len(videos.shape) == 5: + return videos raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/mgp_str/processing_mgp_str.py b/src/transformers/models/mgp_str/processing_mgp_str.py index d194017669c460..4a3bdba95ad829 100644 --- a/src/transformers/models/mgp_str/processing_mgp_str.py +++ b/src/transformers/models/mgp_str/processing_mgp_str.py @@ -155,11 +155,16 @@ def __call__( data.update(text_features) if images is not None: image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) - data.update(image_features) - # TODO: remove this after standardizing the outputs of vision-language processors if "input_ids" in data: - data["labels"] = data["input_ids"] - data.pop("input_ids") + # For backwards compatibility. MGP-STR doesn't actually use the labels, but the tests do. + # And users also expect the labels--and only the labels--to be returned. + # This requirement, however, may be relaxed in future versions. + data = { + "pixel_values": image_features["pixel_values"], + "labels": data["input_ids"], + } + else: + data.update(image_features) return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) def batch_decode(self, sequences): diff --git a/src/transformers/models/tvp/image_processing_tvp.py b/src/transformers/models/tvp/image_processing_tvp.py index 8f05b3e966f1c8..60588d213477f3 100644 --- a/src/transformers/models/tvp/image_processing_tvp.py +++ b/src/transformers/models/tvp/image_processing_tvp.py @@ -52,21 +52,22 @@ # Copied from transformers.models.vivit.image_processing_vivit.make_batched_videos def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos + if isinstance(videos[0][0], PIL.Image.Image) or len(videos[0][0].shape) == 3: + return videos elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - if isinstance(videos[0], PIL.Image.Image): + if isinstance(videos[0], PIL.Image.Image) or len(videos[0].shape) == 3: return [videos] elif len(videos[0].shape) == 4: return videos - elif len(videos[0].shape) == 3: - return [videos] elif is_valid_image(videos): - if len(videos.shape) == 5: - return videos + if isinstance(videos, PIL.Image.Image) or len(videos.shape) == 3: + return [[videos]] elif len(videos.shape) == 4: return [videos] + elif len(videos.shape) == 5: + return videos raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/video_llava/image_processing_video_llava.py b/src/transformers/models/video_llava/image_processing_video_llava.py index 321e36eb50bc12..2472b9bdd85417 100644 --- a/src/transformers/models/video_llava/image_processing_video_llava.py +++ b/src/transformers/models/video_llava/image_processing_video_llava.py @@ -53,21 +53,22 @@ # Copied from transformers.models.vivit.image_processing_vivit.make_batched_videos def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos + if isinstance(videos[0][0], PIL.Image.Image) or len(videos[0][0].shape) == 3: + return videos elif 
isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - if isinstance(videos[0], PIL.Image.Image): + if isinstance(videos[0], PIL.Image.Image) or len(videos[0].shape) == 3: return [videos] elif len(videos[0].shape) == 4: return videos - elif len(videos[0].shape) == 3: - return [videos] elif is_valid_image(videos): - if len(videos.shape) == 5: - return videos + if isinstance(videos, PIL.Image.Image) or len(videos.shape) == 3: + return [[videos]] elif len(videos.shape) == 4: return [videos] + elif len(videos.shape) == 5: + return videos raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/videomae/image_processing_videomae.py b/src/transformers/models/videomae/image_processing_videomae.py index 628c9cb5cd66b9..7355e356196ca4 100644 --- a/src/transformers/models/videomae/image_processing_videomae.py +++ b/src/transformers/models/videomae/image_processing_videomae.py @@ -51,21 +51,22 @@ # Copied from transformers.models.vivit.image_processing_vivit.make_batched_videos def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos + if isinstance(videos[0][0], PIL.Image.Image) or len(videos[0][0].shape) == 3: + return videos elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - if isinstance(videos[0], PIL.Image.Image): + if isinstance(videos[0], PIL.Image.Image) or len(videos[0].shape) == 3: return [videos] elif len(videos[0].shape) == 4: return videos - elif len(videos[0].shape) == 3: - return [videos] elif is_valid_image(videos): - if len(videos.shape) == 5: - return videos + if isinstance(videos, PIL.Image.Image) or len(videos.shape) == 3: + return [[videos]] elif len(videos.shape) == 4: return [videos] + elif len(videos.shape) == 5: + return videos raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/vivit/image_processing_vivit.py b/src/transformers/models/vivit/image_processing_vivit.py index 51d646922feba9..b50b09089f5114 100644 --- a/src/transformers/models/vivit/image_processing_vivit.py +++ b/src/transformers/models/vivit/image_processing_vivit.py @@ -53,21 +53,22 @@ def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos + if isinstance(videos[0][0], PIL.Image.Image) or len(videos[0][0].shape) == 3: + return videos elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - if isinstance(videos[0], PIL.Image.Image): + if isinstance(videos[0], PIL.Image.Image) or len(videos[0].shape) == 3: return [videos] elif len(videos[0].shape) == 4: return videos - elif len(videos[0].shape) == 3: - return [videos] elif is_valid_image(videos): - if len(videos.shape) == 5: - return videos + if isinstance(videos, PIL.Image.Image) or len(videos.shape) == 3: + return [[videos]] elif len(videos.shape) == 4: return [videos] + elif len(videos.shape) == 5: + return videos raise ValueError(f"Could not make batched video from {videos}")