From a94cf092fc16f15013d4d09f1293f4102b1a47c6 Mon Sep 17 00:00:00 2001
From: yonigozlan
Date: Fri, 9 Aug 2024 00:23:48 +0000
Subject: [PATCH 1/6] Uniformize idefics processor kwargs and add tests

---
 .../models/idefics/processing_idefics.py      | 157 +++++++++------
 .../models/idefics2/processing_idefics2.py    |  20 +-
 .../models/idefics/test_processor_idefics.py  | 190 +++++++++++++++++-
 3 files changed, 297 insertions(+), 70 deletions(-)

diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py
index 8e9e196764f923..49d322350d7b03 100644
--- a/src/transformers/models/idefics/processing_idefics.py
+++ b/src/transformers/models/idefics/processing_idefics.py
@@ -16,15 +16,23 @@
 Processor class for IDEFICS.
 """

-from typing import Callable, List, Optional, Union
+import sys
+import warnings
+from typing import List, Union
 from urllib.parse import urlparse

 from ...feature_extraction_utils import BatchFeature
-from ...processing_utils import ProcessorMixin
-from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, TextInput, TruncationStrategy
+from ...processing_utils import ProcessingKwargs, ProcessorMixin
+from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
 from ...utils import is_tf_available, is_torch_available
+from ...utils.deprecation import deprecate_kwarg

+if sys.version_info >= (3, 11):
+    from typing import Unpack
+else:
+    from typing_extensions import Unpack
+
 if is_torch_available():
     import torch

@@ -34,6 +42,16 @@
 IMAGE_TOKEN = "<image>"


+class IdeficsProcessorKwargs(ProcessingKwargs, total=False):
+    _defaults = {
+        "text_kwargs": {
+            "add_special_tokens": False,
+            "padding": "longest",
+        },
+        "images_kwargs": {},
+    }
+
+
 # copied from m4.training.packing
 def incremental_to_binary_attention_mask(incremental_mask, return_tensors, num_classes=-1):
     # Set elements >= num_classes to -1
@@ -199,55 +217,35 @@ def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_u
             else False
         )

+    @deprecate_kwarg(
+        old_name="transform", version="5.0.0", additional_message="Add kwargs to the image processor instead."
+    )
+    @deprecate_kwarg(old_name="prompts", version="5.0.0", new_name="text", raise_if_both_names=True)
     def __call__(
         self,
-        prompts: Union[List[TextInput], List[List[TextInput]]],
-        padding: Union[bool, str, PaddingStrategy] = "longest",
-        truncation: Union[bool, str, TruncationStrategy] = None,
-        max_length: Optional[int] = None,
-        transform: Callable = None,
-        add_eos_token=False,
-        add_end_of_utterance_token=None,
-        debug=False,
-        return_tensors="pt",
+        images=None,
+        text: Union[
+            TextInput,
+            PreTokenizedInput,
+            List[TextInput],
+            List[PreTokenizedInput],
+            List[List[TextInput]],
+            List[List[PreTokenizedInput]],
+        ] = None,
+        audio=None,
+        videos=None,
+        **kwargs: Unpack[IdeficsProcessorKwargs],
     ) -> BatchEncoding:
         """This method takes batched or non-batched prompts made of text and images and converts them into prompts that
         the model was trained on and prepares the image pixel values for the model to process.

         Args:
-            prompts (`Union[List[TextInput], [List[List[TextInput]]]]`):
+            text (`Union[List[TextInput], [List[List[TextInput]]]]`):
                 either a single prompt or a batched list of prompts - see the detailed description immediately after
                 the end of the arguments doc section.
-            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `"longest"`):
-                Select a strategy to pad the returned sequences (according to the model's padding side and padding
-                index) among:
-                - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
-                  sequence if provided).
-                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
-                  acceptable input length for the model if that argument is not provided.
-                - `False` or `'do_not_pad'`: No padding. This will raise an error if the input sequences are of different
-                  lengths.
-                Note: Unlike most processors, which set padding=`False` by default, `IdeficsProcessor` sets `padding="longest"`
-                by default. See https://github.com/huggingface/transformers/pull/29449#pullrequestreview-1925576061 for why.
-            max_length (`int`, *optional*):
-                Maximum length of the returned list and optionally padding length (see above).
-            truncation (`bool`, *optional*):
-                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
-            transform (`Callable`, *optional*):
-                A custom transform function that accepts a single image can be passed for training. For example,
-                `torchvision.Compose` can be used to compose multiple functions. If `None` a preset inference-specific
-                set of transforms will be applied to the images
-            add_eos_token (`bool`, *optional*, defaults to `False`):
-                Adds `eos_token` at the end of the final prompt if True`
-            add_end_of_utterance_token (`bool`, *optional*)
-                Whether to automatically add `<end_of_utterance>` after each prompt's text input (unless followed by an
-                image). If `None` the tokenizer will be checked instead and if this token is found in
-                `additional_special_tokens` then the value will be `True`.
-            debug (`bool`, *optional*, defaults to `False`):
-                `True` value will help debug prompt generation by dumping useful information
-            return_tensors (`str` or `TensorType`, *optional*, defaults to `TensorType.PYTORCH`):
-                The type of tensors to return. Can be one of:
-                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+            images (`Union[PIL.Image, str, List[PIL.Image], List[str]]`):
+                either a single image or a batched list of images - can be passed in when text contains only text prompts,
+                in order to use the image-text-to-text behavior.

         Returns:
             a dict with entries: `input_ids`, `attention_mask`, `pixel_values`, `image_attention_mask` which can be
@@ -255,7 +253,7 @@ def __call__(

         Detailed explanation:

-        Each entry in `prompts` is either a text to be passed as is or an image that will be processed.
+        Each entry in `text` is either a text to be passed as is or an image that will be processed.

         An image can be either an image object (`PIL.Image`) or a url from which the image can be retrieved.

@@ -317,12 +315,60 @@ def __call__(

         In order to help debug prompt generation enable `debug=True` which will show you what's happening.

         """
+        if images is None and text is None:
+            raise ValueError("You need to specify either `text` or both `images` and `text`.")
+        # for BC
+        if text is None:
+            # if the user didn't specify text=text in the call, we assume they want to use the old behavior
+            # with text (previously prompts) as a first argument
+            warnings.warn(
+                "The use of `text` as the first argument will be deprecated in the future. `images` is now the first argument. "
+                "The first given argument will be considered as `prompts` in the old behavior.",
+            )
+            text = images
+            images = None
+        if images is None:
+            # assuming the user wants to use the old behavior with prompts as the only argument
+            prompts = text
+        elif text is not None:
+            # Assuming image-text-to-text behavior:
+            # Check if batched images are provided
+            if not isinstance(images, (list, tuple)):
+                images = [images]
+            if isinstance(text, str):
+                # one prompt for all images instead of one prompt per image
+                text = [text] * len(images)
+            # Check if batched text is provided
+            if isinstance(text, (list, tuple)) and len(text) != len(images):
+                raise ValueError(
+                    "When using the image-text-to-text behavior, the number of prompts should be the same as the number of images."
+                )
+            # Check that only text is present in the prompts
+            if not all(isinstance(i, str) for i in text):
+                raise ValueError("When using the image-text-to-text behavior, the prompts should only contain text.")
+            prompts = list(zip(images, text))
+
+        # for BC
+        transform = kwargs.pop("transform", None)
+        add_eos_token = kwargs.pop("add_eos_token", False)
+        add_end_of_utterance_token = kwargs.pop("add_end_of_utterance_token", None)
+
+        # Temporary fix for "paddding_side" in init_kwargs
+        _ = self.tokenizer.init_kwargs.pop("padding_side", None)
+
+        output_kwargs = self._merge_kwargs(
+            IdeficsProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        if transform is not None:
+            output_kwargs["images_kwargs"]["transform"] = transform

         # if the value isn't overriden by the user, check if the tokenizer was trained with this token and then use it
         if add_end_of_utterance_token is None:
             add_end_of_utterance_token = self.tokenizer_was_trained_with_end_of_utterance_token

         # turn non-batched prompts into batched
-        if not any(isinstance(i, list) for i in prompts):
+        if not any(isinstance(i, (list, tuple)) for i in prompts):
             prompts = [prompts]

         fake_token = "<fake_token_around_image>"
@@ -371,21 +417,14 @@ def image_tokens(last_was_image):
             if add_eos_token:
                 full_text += self.tokenizer.eos_token

-            if debug is True:
-                print(f"{full_text=}")
-
-            image_objects = self.image_processor(image_objects, transform=transform, return_tensors=return_tensors)
+            image_objects = self.image_processor(image_objects, **output_kwargs["images_kwargs"])

             all_prompts.append(full_text)
             all_images.append(image_objects)

-        text_encoding = self.tokenizer(
-            text=all_prompts,
-            add_special_tokens=False,
-            padding=padding,
-            truncation=truncation,
-            max_length=max_length,
-        )
+        # For BC
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", "pt")
+        text_encoding = self.tokenizer(all_prompts, **output_kwargs["text_kwargs"])
         all_texts = text_encoding["input_ids"]
         all_attention_masks = text_encoding["attention_mask"]

@@ -398,12 +437,12 @@ def image_tokens(last_was_image):
         output_images = []
         output_attention_masks = []

-        for text, attention_mask, images in zip(all_texts, all_attention_masks, all_images):
-            padded_input_ids = text
+        for text_single, attention_mask, extracted_images in zip(all_texts, all_attention_masks, all_images):
+            padded_input_ids = text_single
             image_count = padded_input_ids.count(self.image_token_id)
             local_max_num_images = min(image_count, max_num_images)

-            current_images = images[:local_max_num_images]
+            current_images = extracted_images[:local_max_num_images]

             if len(current_images) > 0:
                 if return_tensors == "pt":
diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py
index
2e14118144baaa..cc61c670cc49fc 100644 --- a/src/transformers/models/idefics2/processing_idefics2.py +++ b/src/transformers/models/idefics2/processing_idefics2.py @@ -20,7 +20,7 @@ from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, is_valid_image, load_image -from ...processing_utils import ProcessorMixin +from ...processing_utils import ProcessingKwargs, ProcessorMixin from ...tokenization_utils_base import AddedToken, BatchEncoding, PaddingStrategy, TextInput, TruncationStrategy from ...utils import TensorType, logging @@ -40,6 +40,24 @@ def is_image_or_image_url(elem): return is_url(elem) or is_valid_image(elem) +class IdeficsProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "add_special_tokens": True, + "padding": False, + "stride": 0, + "return_attention_mask": True, + "return_overflowing_tokens": False, + "return_special_tokens_mask": False, + "return_offsets_mapping": False, + "return_token_type_ids": False, + "return_length": False, + "verbose": True, + }, + "images_kwargs": {}, + } + + class Idefics2Processor(ProcessorMixin): r""" Constructs a IDEFICS2 processor which wraps a LLama tokenizer and IDEFICS2 image processor into a single processor. diff --git a/tests/models/idefics/test_processor_idefics.py b/tests/models/idefics/test_processor_idefics.py index 26dcbb1c0f1566..31c9da7d750aa9 100644 --- a/tests/models/idefics/test_processor_idefics.py +++ b/tests/models/idefics/test_processor_idefics.py @@ -12,11 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. +import shutil +import tempfile +import unittest + import numpy as np -from transformers.testing_utils import TestCasePlus, require_torch, require_vision +from transformers.testing_utils import require_torch, require_vision from transformers.utils import is_torch_available, is_vision_available +from ...test_processing_common import ProcessorTesterMixin + if is_torch_available(): import torch @@ -35,26 +41,29 @@ @require_torch @require_vision -class IdeficsProcessorTest(TestCasePlus): - def setUp(self): - super().setUp() +class IdeficsProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = IdeficsProcessor - self.checkpoint_path = self.get_auto_remove_tmp_dir() + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() image_processor = IdeficsImageProcessor(return_tensors="pt") tokenizer = LlamaTokenizerFast.from_pretrained("HuggingFaceM4/tiny-random-idefics") processor = IdeficsProcessor(image_processor, tokenizer) - processor.save_pretrained(self.checkpoint_path) + processor.save_pretrained(self.tmpdirname) self.input_keys = ["pixel_values", "input_ids", "attention_mask", "image_attention_mask"] def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.checkpoint_path, **kwargs).tokenizer + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.checkpoint_path, **kwargs).image_processor + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor + + def tearDown(self): + shutil.rmtree(self.tmpdirname) def prepare_prompts(self): """This function prepares a list of PIL images""" @@ -100,13 +109,13 @@ def prepare_prompts(self): def test_save_load_pretrained_additional_features(self): processor = IdeficsProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) - 
processor.save_pretrained(self.checkpoint_path) + processor.save_pretrained(self.tmpdirname) tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) processor = IdeficsProcessor.from_pretrained( - self.checkpoint_path, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 + self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 ) self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) @@ -208,3 +217,164 @@ def test_model_input_names(self): # For now the processor supports only ['pixel_values', 'input_ids', 'attention_mask'] self.assertSetEqual(set(inputs.keys()), set(self.input_keys)) + + @require_vision + @require_torch + def test_tokenizer_defaults_preserved_by_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input, return_tensors="pt") + self.assertEqual(len(inputs["input_ids"][0]), 117) + + def test_image_processor_defaults_preserved_by_image_kwargs(self): + self.skipTest(reason="IdeficsImageProcessor kwargs are different from usual image processors") + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor", image_size=234) + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + self.assertEqual(len(inputs["pixel_values"][0][0][0]), 234) + + @require_vision + @require_torch + def test_kwargs_overrides_default_tokenizer_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor( + text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length" + ) + self.assertEqual(len(inputs["input_ids"][0]), 112) + + def test_kwargs_overrides_default_image_processor_kwargs(self): + self.skipTest(reason="IdeficsImageProcessor kwargs are different from usual image processors") + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor", image_size=234) + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = 
self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input, image_size=224) + self.assertEqual(len(inputs["pixel_values"][0][0][0]), 224) + + @require_torch + @require_vision + def test_unstructured_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + padding="max_length", + max_length=76, + ) + + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + @require_vision + def test_unstructured_kwargs_batched(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = ["lower newer", "upper older longer string"] + image_input = self.prepare_image_inputs() * 2 + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + padding="longest", + max_length=76, + ) + + self.assertEqual(len(inputs["input_ids"][0]), 8) + + @require_torch + @require_vision + def test_structured_kwargs_nested(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.skip_processor_without_typed_kwargs(processor) + + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + @require_vision + def test_structured_kwargs_nested_from_dict(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + + 
self.assertEqual(len(inputs["input_ids"][0]), 76) From 8af76fad5f61c2a639c9631255982df483f29741 Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Fri, 9 Aug 2024 16:06:38 +0000 Subject: [PATCH 2/6] Uniformize idefics2 processor kwargs --- .../models/idefics/processing_idefics.py | 33 ++- .../models/idefics2/processing_idefics2.py | 84 +++--- .../idefics2/test_processing_idefics2.py | 278 +++++++++++++++--- 3 files changed, 296 insertions(+), 99 deletions(-) diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py index 49d322350d7b03..e644d3356c01ef 100644 --- a/src/transformers/models/idefics/processing_idefics.py +++ b/src/transformers/models/idefics/processing_idefics.py @@ -18,11 +18,11 @@ import sys import warnings -from typing import List, Union +from typing import Callable, List, Optional, Union from urllib.parse import urlparse from ...feature_extraction_utils import BatchFeature -from ...processing_utils import ProcessingKwargs, ProcessorMixin +from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput from ...utils import is_tf_available, is_torch_available from ...utils.deprecation import deprecate_kwarg @@ -42,11 +42,23 @@ IMAGE_TOKEN = "" +class IdeficsImagesKwargs(ImagesKwargs, total=False): + transform: Optional[Callable] + + +class IdeficsTextKwargs(TextKwargs, total=False): + add_eos_token: Optional[bool] + add_end_of_utterance_token: Optional[bool] + + class IdeficsProcessorKwargs(ProcessingKwargs, total=False): + text_kwargs: IdeficsTextKwargs + images_kwargs: IdeficsImagesKwargs _defaults = { "text_kwargs": { "add_special_tokens": False, "padding": "longest", + "add_eos_token": False, }, "images_kwargs": {}, } @@ -217,9 +229,6 @@ def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_u else False ) - @deprecate_kwarg( - old_name="transform", version="5.0.0", additional_message="Add kwargs to the image processor instead." - ) @deprecate_kwarg(old_name="prompts", version="5.0.0", new_name="text", raise_if_both_names=True) def __call__( self, @@ -277,7 +286,7 @@ def __call__( "Describe this image.\nAssistant:", ] - inputs = processor(prompts, return_tensors="pt") + inputs = processor(text=prompts, return_tensors="pt") generated_ids = model.generate(**inputs, max_length=100) generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] ``` @@ -309,7 +318,7 @@ def __call__( transforms.Normalize(mean=self.image_mean, std=self.image_std), ] ) - inputs = processor(prompts, transform=image_transform, return_tensors="pt") + inputs = processor(text=prompts, transform=image_transform, return_tensors="pt") ``` In order to help debug prompt generation enable `debug=True` which will show you what's happening. 
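For readers following the hunks above: after this patch, the backward-compatibility kwargs (`transform`, `add_eos_token`, `add_end_of_utterance_token`) travel through the typed `IdeficsImagesKwargs`/`IdeficsTextKwargs` dicts instead of loose `**kwargs`. A minimal sketch of a call site once the patch is applied — the checkpoint name, prompt contents, and image URL are illustrative assumptions, not values from the diff; only the keyword names come from the kwargs classes defined above:

```python
# Sketch of the post-patch call surface (illustrative inputs, not from the diff).
from transformers import IdeficsProcessor

processor = IdeficsProcessor.from_pretrained("HuggingFaceM4/idefics-9b")

prompts = [
    ["User:", "https://example.com/some-image.jpg", "Describe this image.\nAssistant:"],
]

# Flat kwargs are validated against IdeficsProcessorKwargs by _merge_kwargs and
# layered over the class-level _defaults (add_special_tokens=False,
# padding="longest", add_eos_token=False).
inputs = processor(
    text=prompts,
    add_end_of_utterance_token=False,  # routed into output_kwargs["text_kwargs"]
    return_tensors="pt",               # popped back out for tensor conversion
)
```
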
@@ -348,11 +357,6 @@ def __call__( raise ValueError("When using the image-text-to-text behavior, the prompts should only contain text.") prompts = list(zip(images, text)) - # for BC - transform = kwargs.pop("transform", None) - add_eos_token = kwargs.pop("add_eos_token", False) - add_end_of_utterance_token = kwargs.pop("add_end_of_utterance_token", None) - # Temporary fix for "paddding_side" in init_kwargs _ = self.tokenizer.init_kwargs.pop("padding_side", None) @@ -361,8 +365,9 @@ def __call__( tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, ) - if transform is not None: - output_kwargs["images_kwargs"]["transform"] = transform + + add_eos_token = output_kwargs["text_kwargs"].pop("add_eos_token", False) + add_end_of_utterance_token = output_kwargs["text_kwargs"].pop("add_end_of_utterance_token", None) # if the value isn't overriden by the user, check if the tokenizer was trained with this token and then use it if add_end_of_utterance_token is None: diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py index cc61c670cc49fc..b42a121eda26dc 100644 --- a/src/transformers/models/idefics2/processing_idefics2.py +++ b/src/transformers/models/idefics2/processing_idefics2.py @@ -16,18 +16,24 @@ Processor class for IDEFICS2. """ +import sys from typing import TYPE_CHECKING, List, Optional, Union from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, is_valid_image, load_image -from ...processing_utils import ProcessingKwargs, ProcessorMixin -from ...tokenization_utils_base import AddedToken, BatchEncoding, PaddingStrategy, TextInput, TruncationStrategy -from ...utils import TensorType, logging +from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin +from ...tokenization_utils_base import AddedToken, BatchEncoding, TextInput +from ...utils import logging if TYPE_CHECKING: from ...tokenization_utils_base import PreTokenizedInput +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + logger = logging.get_logger(__name__) @@ -40,19 +46,18 @@ def is_image_or_image_url(elem): return is_url(elem) or is_valid_image(elem) -class IdeficsProcessorKwargs(ProcessingKwargs, total=False): +class Idefics2ImagesKwargs(ImagesKwargs, total=False): + image_seq_len: Optional[int] + + +class Idefics2ProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: Idefics2ImagesKwargs + _defaults = { "text_kwargs": { "add_special_tokens": True, "padding": False, - "stride": 0, - "return_attention_mask": True, - "return_overflowing_tokens": False, - "return_special_tokens_mask": False, - "return_offsets_mapping": False, - "return_token_type_ids": False, - "return_length": False, - "verbose": True, + "is_split_into_words": False, }, "images_kwargs": {}, } @@ -115,15 +120,11 @@ def _extract_images_from_prompts(self, prompts): def __call__( self, - text: Union[TextInput, "PreTokenizedInput", List[TextInput], List["PreTokenizedInput"]] = None, images: Union[ImageInput, List[ImageInput], List[List[ImageInput]]] = None, - image_seq_len: Optional[int] = None, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = None, - max_length: Optional[int] = None, - is_split_into_words: bool = False, - add_special_tokens: bool = True, - return_tensors: Optional[Union[str, TensorType]] = None, + text: Union[TextInput, "PreTokenizedInput", List[TextInput], List["PreTokenizedInput"]] = None, + 
audio=None,
+        videos=None,
+        **kwargs: Unpack[Idefics2ProcessorKwargs],
     ) -> BatchEncoding:
         """
         Processes the input prompts and returns a BatchEncoding.
@@ -156,6 +157,9 @@ def __call__(
         ```

         Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. If is of type `List[ImageInput]`, it's assumed that this is for a single prompt i.e. of batch size 1.
             text (`Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]`, *optional*):
                 The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                 (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                 `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).

                 Wherever an image token, `<image>` is encountered it is expanded to
                 `<fake_token_around_image>` + `<image>` * `image_seq_len` * `<fake_token_around_image>`.
-            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*):
-                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
-                tensor. If is of type `List[ImageInput]`, it's assumed that this is for a single prompt i.e. of batch size 1.
-            image_seq_len (`int`, *optional*):
-                The length of the image sequence. If not provided, the default value is used.
-            padding (`Union[bool, str, PaddingStrategy]`, *optional*, defaults to `False`):
-                Padding strategy applied to the input ids. See [`PreTrainedTokenizerFast.pad`] for more information.
-            truncation (`Union[bool, str, TruncationStrategy]`, *optional*):
-                Truncation strategy applied to the input ids. See [`PreTrainedTokenizerFast.truncate`] for more information.
-            max_length (`int`, *optional*):
-                Maximum length of the returned list and optionally padding/truncation length. See
-                [`PreTrainedTokenizerFast.__call__`] for more information.
-            is_split_into_words (`bool`, *optional*, defaults to `False`):
-                Whether the input text is split into words or not. If set to `True`, the tokenizer will skip the
-                tokenization process and assume the input is already tokenized.
-            add_special_tokens (`bool`, *optional*, defaults to `True`):
-                Whether to add special tokens or not. See [`PreTrainedTokenizerFast.__call__`] for more information.
-            return_tensors (`Union[str, TensorType]`, *optional*):
-                If set, will return tensors of a particular framework. See [`PreTrainedTokenizerFast.__call__`] for more
-                information.
+ """ + + output_kwargs = self._merge_kwargs( + Idefics2ProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + image_seq_len = output_kwargs["images_kwargs"].pop("image_seq_len", None) image_seq_len = image_seq_len if image_seq_len is not None else self.image_seq_len n_images_in_text = [] @@ -212,15 +204,7 @@ def __call__( sample = sample.replace(f"{fake_image_token}{fake_image_token}", f"{fake_image_token}") prompt_strings.append(sample) - text_inputs = self.tokenizer( - text=prompt_strings, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - is_split_into_words=is_split_into_words, - return_tensors=return_tensors, - ) + text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) inputs.update(text_inputs) if images is not None: @@ -245,7 +229,7 @@ def __call__( # Load images if they are URLs images = [[load_image(im) for im in sample] for sample in images] - image_inputs = self.image_processor(images, return_tensors=return_tensors) + image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) inputs.update(image_inputs) return inputs diff --git a/tests/models/idefics2/test_processing_idefics2.py b/tests/models/idefics2/test_processing_idefics2.py index 2fd569f99141af..e8292807626bf2 100644 --- a/tests/models/idefics2/test_processing_idefics2.py +++ b/tests/models/idefics2/test_processing_idefics2.py @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import shutil +import tempfile import unittest from io import BytesIO @@ -22,16 +24,30 @@ from transformers.testing_utils import require_torch, require_vision from transformers.utils import is_vision_available +from ...test_processing_common import ProcessorTesterMixin + if is_vision_available(): from PIL import Image + from transformers import ( + AutoProcessor, + Idefics2Processor, + ) + @require_torch @require_vision -class Idefics2ProcessorTest(unittest.TestCase): +class Idefics2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = Idefics2Processor + def setUp(self): - self.processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics2-8b", image_seq_len=2) + self.tmpdirname = tempfile.mkdtemp() + + processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics2-8b", image_seq_len=2) + + processor.save_pretrained(self.tmpdirname) + self.image1 = Image.open( BytesIO( requests.get( @@ -49,22 +65,35 @@ def setUp(self): ).content ) ) - self.bos_token = self.processor.tokenizer.bos_token - self.image_token = self.processor.image_token.content - self.fake_image_token = self.processor.fake_image_token.content + self.bos_token = processor.tokenizer.bos_token + self.image_token = processor.image_token.content + self.fake_image_token = processor.fake_image_token.content + + self.bos_token_id = processor.tokenizer.convert_tokens_to_ids(self.bos_token) + self.image_token_id = processor.tokenizer.convert_tokens_to_ids(self.image_token) + self.fake_image_token_id = processor.tokenizer.convert_tokens_to_ids(self.fake_image_token) + self.image_seq_len = processor.image_seq_len + + def get_tokenizer(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer - self.bos_token_id = self.processor.tokenizer.convert_tokens_to_ids(self.bos_token) - self.image_token_id = self.processor.tokenizer.convert_tokens_to_ids(self.image_token) - self.fake_image_token_id = 
self.processor.tokenizer.convert_tokens_to_ids(self.fake_image_token)
-        self.image_seq_len = self.processor.image_seq_len
+    def get_image_processor(self, **kwargs):
+        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
+
+    def get_processor(self, **kwargs):
+        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)
+
+    def tearDown(self):
+        shutil.rmtree(self.tmpdirname)

     def test_process_interleaved_images_prompts_no_image_splitting(self):
-        old_image_splitting = self.processor.image_processor.do_image_splitting
+        tokenizer = self.get_tokenizer()
+        processor = self.get_processor()

-        self.processor.image_processor.do_image_splitting = False
+        processor.image_processor.do_image_splitting = False

         # Test that a single image is processed correctly
-        inputs = self.processor(images=self.image1)
+        inputs = processor(images=self.image1)
         self.assertEqual(inputs["pixel_values"].shape, (1, 1, 3, 653, 980))
         self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 1, 653, 980))
         # fmt: on
@@ -73,10 +102,10 @@ def test_process_interleaved_images_prompts_no_image_splitting(self):
         image_str = "<image>"
         text_str = "In this image, we see"
         text = image_str + text_str
-        inputs = self.processor(text=text, images=self.image1)
+        inputs = processor(text=text, images=self.image1)

         # fmt: off
-        tokenized_sentence = self.processor.tokenizer(text_str, add_special_tokens=False)
+        tokenized_sentence = tokenizer(text_str, add_special_tokens=False)
         expected_input_ids = [[self.bos_token_id] + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + tokenized_sentence["input_ids"]]
         self.assertEqual(inputs["input_ids"], expected_input_ids)
         self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])])
@@ -95,11 +124,11 @@ def test_process_interleaved_images_prompts_no_image_splitting(self):
         ]
         images = [[self.image1], [self.image2, self.image3]]

-        inputs = self.processor(text=text, images=images, padding=True)
+        inputs = processor(text=text, images=images, padding=True)

         # fmt: off
-        tokenized_sentence_1 = self.processor.tokenizer(text_str_1, add_special_tokens=False)
-        tokenized_sentence_2 = self.processor.tokenizer(text_str_2, add_special_tokens=False)
+        tokenized_sentence_1 = tokenizer(text_str_1, add_special_tokens=False)
+        tokenized_sentence_2 = tokenizer(text_str_2, add_special_tokens=False)
         expected_input_ids_1 = [self.bos_token_id] + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + tokenized_sentence_1["input_ids"]
         expected_input_ids_2 = [self.bos_token_id] + tokenized_sentence_2["input_ids"] + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id]
         # Pad the first input to match the second input
@@ -117,15 +146,13 @@ def test_process_interleaved_images_prompts_no_image_splitting(self):
         self.assertEqual(inputs['pixel_attention_mask'].shape, (2, 2, 767, 980))
         # fmt: on

-        self.processor.image_processor.do_image_splitting = old_image_splitting
-
     def test_process_interleaved_images_prompts_image_splitting(self):
-        old_image_splitting = self.processor.image_processor.do_image_splitting
-
-        self.processor.image_processor.do_image_splitting = True
+        processor = self.get_processor()
+        tokenizer = self.get_tokenizer()
+        processor.image_processor.do_image_splitting = True

         # Test that a single image is processed correctly
-        inputs = self.processor(images=self.image1)
+        inputs = processor(images=self.image1)
         self.assertEqual(inputs["pixel_values"].shape, (1, 5, 3, 653, 980))
         self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 5, 653, 980))
         # fmt: on
@@ -134,10 +161,10 @@ def test_process_interleaved_images_prompts_image_splitting(self):
         image_str = "<image>"
         text_str = "In this image, we see"
         text = image_str + text_str
-        inputs = self.processor(text=text, images=self.image1)
+        inputs = processor(text=text, images=self.image1)

         # fmt: off
-        tokenized_sentence = self.processor.tokenizer(text_str, add_special_tokens=False)
+        tokenized_sentence = tokenizer(text_str, add_special_tokens=False)
         expected_input_ids = [[self.bos_token_id] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + [self.fake_image_token_id] + tokenized_sentence["input_ids"]]
         self.assertEqual(inputs["input_ids"], expected_input_ids)
         self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])])
@@ -156,11 +183,11 @@ def test_process_interleaved_images_prompts_image_splitting(self):
         ]
         images = [[self.image1], [self.image2, self.image3]]

-        inputs = self.processor(text=text, images=images, padding=True)
+        inputs = processor(text=text, images=images, padding=True)

         # fmt: off
-        tokenized_sentence_1 = self.processor.tokenizer(text_str_1, add_special_tokens=False)
-        tokenized_sentence_2 = self.processor.tokenizer(text_str_2, add_special_tokens=False)
+        tokenized_sentence_1 = tokenizer(text_str_1, add_special_tokens=False)
+        tokenized_sentence_2 = tokenizer(text_str_2, add_special_tokens=False)
         expected_input_ids_1 = [self.bos_token_id] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + [self.fake_image_token_id] + tokenized_sentence_1["input_ids"]
         expected_input_ids_2 = [self.bos_token_id] + tokenized_sentence_2["input_ids"] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + [self.fake_image_token_id]
         # Pad the first input to match the second input
@@ -178,22 +205,22 @@ def test_process_interleaved_images_prompts_image_splitting(self):
         self.assertEqual(inputs['pixel_attention_mask'].shape, (2, 10, 767, 980))
         # fmt: on

-        self.processor.image_processor.do_image_splitting = old_image_splitting
-
     def test_add_special_tokens_processor(self):
+        processor = self.get_processor()
+        tokenizer = self.get_tokenizer()
         image_str = "<image>"
         text_str = "In this image, we see"
         text = text_str + image_str

-        n_image_repeat = 5 if self.processor.image_processor.do_image_splitting else 1
+        n_image_repeat = 5 if processor.image_processor.do_image_splitting else 1

         # fmt: off
-        inputs = self.processor(text=text, images=self.image1, add_special_tokens=False)
-        tokenized_sentence = self.processor.tokenizer(text_str, add_special_tokens=False)
+        inputs = processor(text=text, images=self.image1, add_special_tokens=False)
+        tokenized_sentence = tokenizer(text_str, add_special_tokens=False)
         expected_input_ids = [tokenized_sentence["input_ids"] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * n_image_repeat + [self.fake_image_token_id]]
         self.assertEqual(inputs["input_ids"], expected_input_ids)

-        inputs = self.processor(text=text, images=self.image1)
+        inputs = processor(text=text, images=self.image1)
         expected_input_ids = [[self.bos_token_id] + tokenized_sentence["input_ids"] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * n_image_repeat + [self.fake_image_token_id]]
         self.assertEqual(inputs["input_ids"], expected_input_ids)
         # fmt: on
@@ -222,7 +249,7 @@ def test_apply_chat_template(self):
             {"role": "user", "content": [{"type": "text", "text": "And who is that?"}]},
         ]

-        processor = self.processor
+        processor = self.get_processor()
         # Make short sequence length to test that the fake tokens are added correctly
         rendered = processor.apply_chat_template(messages, add_generation_prompt=True)

@@ -233,3 +260,184 @@ def test_apply_chat_template(self):
             "Assistant:"
         )
         self.assertEqual(rendered, expected_rendered)
+
+    @require_vision
+    @require_torch
+    def test_tokenizer_defaults_preserved_by_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+        image_str = "<image>"
+        input_str = image_str + "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        inputs = processor(text=input_str, images=image_input, return_tensors="pt")
+        self.assertEqual(len(inputs["input_ids"][0]), 117)
+
+    @require_torch
+    @require_vision
+    def test_image_processor_defaults_preserved_by_image_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor", size={"height": 234, "width": 234})
+        tokenizer = self.get_component("tokenizer", max_length=117)
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        image_str = "<image>"
+        input_str = image_str + "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        inputs = processor(text=input_str, images=image_input)
+        self.assertEqual(len(inputs["pixel_values"][0][0][0]), 234)
+
+    @require_vision
+    @require_torch
+    def test_kwargs_overrides_default_tokenizer_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer", max_length=117)
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+        image_str = "<image>"
+        input_str = image_str + "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        inputs = processor(
+            text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length"
+        )
+        self.assertEqual(len(inputs["input_ids"][0]), 112)
+
+    @require_torch
+    @require_vision
+    def test_kwargs_overrides_default_image_processor_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor", size={"height": 234, "width": 234})
+        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        image_str = "<image>"
+        input_str = image_str + "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        inputs = processor(text=input_str, images=image_input, size={"height": 224, "width": 224})
+        self.assertEqual(len(inputs["pixel_values"][0][0][0]), 224)
+
+    @require_torch
+    @require_vision
+    def test_unstructured_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        image_str = "<image>"
+        input_str = image_str + "lower newer"
+        image_input = self.prepare_image_inputs()
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            size={"height": 214, "width": 214},
+            padding="max_length",
+            max_length=76,
+        )
+
+        self.assertEqual(inputs["pixel_values"].shape[3], 214)
+        self.assertEqual(len(inputs["input_ids"][0]), 76)
+
+    @require_torch
+    @require_vision
+    def test_unstructured_kwargs_batched(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        image_str = "<image>"
+        input_str = [image_str + "lower newer", image_str + "upper older longer string"]
+        image_input = [self.prepare_image_inputs()] * 2
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            size={"height": 214, "width": 214},
+            padding="longest",
+            max_length=76,
+        )
+
+        self.assertEqual(inputs["pixel_values"].shape[3], 214)
+
+        self.assertEqual(len(inputs["input_ids"][0]), 21)
+
+    @require_torch
+    @require_vision
+    def test_structured_kwargs_nested(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        image_str = "<image>"
+        input_str = image_str + "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        # Define the kwargs for each modality
+        all_kwargs = {
+            "common_kwargs": {"return_tensors": "pt"},
+            "images_kwargs": {"size": {"height": 214, "width": 214}},
+            "text_kwargs": {"padding": "max_length", "max_length": 76},
+        }
+
+        inputs = processor(text=input_str, images=image_input, **all_kwargs)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        self.assertEqual(inputs["pixel_values"].shape[3], 214)
+
+        self.assertEqual(len(inputs["input_ids"][0]), 76)
+
+    @require_torch
+    @require_vision
+    def test_structured_kwargs_nested_from_dict(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+        image_str = "<image>"
+        input_str = image_str + "lower newer"
+        image_input = self.prepare_image_inputs()
+ # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": {"size": {"height": 214, "width": 214}}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.assertEqual(inputs["pixel_values"].shape[3], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 76) From e6747ff8b403d143b0d56892368fdb426a550dbb Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Mon, 12 Aug 2024 22:21:54 +0000 Subject: [PATCH 3/6] add image_processor tests idefics --- .../models/idefics/processing_idefics.py | 6 +++++- tests/models/idefics/test_processor_idefics.py | 12 ++++++++---- tests/models/idefics2/test_processing_idefics2.py | 4 ---- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py index e644d3356c01ef..3a7fcc80ffb225 100644 --- a/src/transformers/models/idefics/processing_idefics.py +++ b/src/transformers/models/idefics/processing_idefics.py @@ -18,7 +18,7 @@ import sys import warnings -from typing import Callable, List, Optional, Union +from typing import Callable, Dict, List, Optional, Union from urllib.parse import urlparse from ...feature_extraction_utils import BatchFeature @@ -44,6 +44,9 @@ class IdeficsImagesKwargs(ImagesKwargs, total=False): transform: Optional[Callable] + image_size: Optional[Dict[str, int]] + image_mean: Optional[Union[float, List[float]]] + image_std: Optional[Union[float, List[float]]] class IdeficsTextKwargs(TextKwargs, total=False): @@ -61,6 +64,7 @@ class IdeficsProcessorKwargs(ProcessingKwargs, total=False): "add_eos_token": False, }, "images_kwargs": {}, + "common_kwargs": {"return_tensors": "pt"}, } diff --git a/tests/models/idefics/test_processor_idefics.py b/tests/models/idefics/test_processor_idefics.py index 31c9da7d750aa9..bdb5554b9402d7 100644 --- a/tests/models/idefics/test_processor_idefics.py +++ b/tests/models/idefics/test_processor_idefics.py @@ -235,7 +235,6 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): self.assertEqual(len(inputs["input_ids"][0]), 117) def test_image_processor_defaults_preserved_by_image_kwargs(self): - self.skipTest(reason="IdeficsImageProcessor kwargs are different from usual image processors") if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor", image_size=234) @@ -269,7 +268,6 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): self.assertEqual(len(inputs["input_ids"][0]), 112) def test_kwargs_overrides_default_image_processor_kwargs(self): - self.skipTest(reason="IdeficsImageProcessor kwargs are different from usual image processors") if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor", image_size=234) @@ -301,10 +299,12 @@ def test_unstructured_kwargs(self): text=input_str, images=image_input, return_tensors="pt", + image_size=214, padding="max_length", max_length=76, ) + self.assertEqual(inputs["pixel_values"].shape[3], 214) self.assertEqual(len(inputs["input_ids"][0]), 76) @require_torch @@ -324,10 +324,12 @@ def test_unstructured_kwargs_batched(self): text=input_str, images=image_input, return_tensors="pt", + image_size=214, padding="longest", max_length=76, 
) + self.assertEqual(inputs["pixel_values"].shape[3], 214) self.assertEqual(len(inputs["input_ids"][0]), 8) @require_torch @@ -347,12 +349,13 @@ def test_structured_kwargs_nested(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": {"image_size": 214}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } inputs = processor(text=input_str, images=image_input, **all_kwargs) self.skip_processor_without_typed_kwargs(processor) - + self.assertEqual(inputs["pixel_values"].shape[3], 214) self.assertEqual(len(inputs["input_ids"][0]), 76) @require_torch @@ -372,9 +375,10 @@ def test_structured_kwargs_nested_from_dict(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": {"image_size": 214}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } inputs = processor(text=input_str, images=image_input, **all_kwargs) - + self.assertEqual(inputs["pixel_values"].shape[3], 214) self.assertEqual(len(inputs["input_ids"][0]), 76) diff --git a/tests/models/idefics2/test_processing_idefics2.py b/tests/models/idefics2/test_processing_idefics2.py index e8292807626bf2..25ab3bd67c5f4e 100644 --- a/tests/models/idefics2/test_processing_idefics2.py +++ b/tests/models/idefics2/test_processing_idefics2.py @@ -383,7 +383,6 @@ def test_unstructured_kwargs_batched(self): ) self.assertEqual(inputs["pixel_values"].shape[3], 214) - self.assertEqual(len(inputs["input_ids"][0]), 21) @require_torch @@ -410,9 +409,7 @@ def test_structured_kwargs_nested(self): inputs = processor(text=input_str, images=image_input, **all_kwargs) self.skip_processor_without_typed_kwargs(processor) - self.assertEqual(inputs["pixel_values"].shape[3], 214) - self.assertEqual(len(inputs["input_ids"][0]), 76) @require_torch @@ -439,5 +436,4 @@ def test_structured_kwargs_nested_from_dict(self): inputs = processor(text=input_str, images=image_input, **all_kwargs) self.assertEqual(inputs["pixel_values"].shape[3], 214) - self.assertEqual(len(inputs["input_ids"][0]), 76) From 1a231cbfabede4c2390e54a2f6bae3958c2b4190 Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Mon, 12 Aug 2024 22:34:32 +0000 Subject: [PATCH 4/6] add BC args order change idefics2 processor and update doc --- .../models/idefics2/modeling_idefics2.py | 2 +- .../models/idefics2/processing_idefics2.py | 21 ++++++++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py index cdc7e9ba4e77b0..dca3b177cd3fc1 100644 --- a/src/transformers/models/idefics2/modeling_idefics2.py +++ b/src/transformers/models/idefics2/modeling_idefics2.py @@ -1558,7 +1558,7 @@ def forward( ... "In which city is that bridge located?", ... 
]
     >>> images = [[image1, image2], [image3]]

-    >>> inputs = processor(text=prompts, images=images, padding=True, return_tensors="pt").to("cuda")
+    >>> inputs = processor(images=images, text=prompts, padding=True, return_tensors="pt").to("cuda")

     >>> # Generate
     >>> generated_ids = model.generate(**inputs, bad_words_ids=BAD_WORDS_IDS, max_new_tokens=20)
diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py
index b42a121eda26dc..96b7c7601cac12 100644
--- a/src/transformers/models/idefics2/processing_idefics2.py
+++ b/src/transformers/models/idefics2/processing_idefics2.py
@@ -17,6 +17,7 @@
 """

 import sys
+import warnings
 from typing import TYPE_CHECKING, List, Optional, Union

 from ...feature_extraction_utils import BatchFeature
@@ -149,7 +150,7 @@ def __call__(
         ...     "In this image, we see",
         ...     "bla bla bla",
         ... ]
-        >>> outputs = processor(text=text, images=images, return_tensors="pt", padding=True)
+        >>> outputs = processor(images=images, text=text, return_tensors="pt", padding=True)
         >>> input_ids = outputs.input_ids
         >>> input_tokens = processor.tokenizer.batch_decode(input_ids)
         >>> print(input_tokens)
@@ -169,6 +170,24 @@ def __call__(
             `<fake_token_around_image>` + `<image>` * `image_seq_len` * `<fake_token_around_image>`.

         """
+        if text is None and images is None:
+            raise ValueError("You must provide either `text` or `images`.")
+        # check if images and text inputs are reversed for BC
+        if (
+            text is not None
+            and not isinstance(text[0], str)
+            or images is not None
+            and not (
+                is_image_or_image_url(images)
+                or is_image_or_image_url(images[0])
+                or (isinstance(images[0], list) and is_image_or_image_url(images[0][0]))
+            )
+        ):
+            warnings.warn(
+                "It looks like you are passing the inputs in the wrong order. You should pass the images input first and the text input second. "
+                "Images and text inputs will be swapped."
+            )
+            images, text = text, images

         output_kwargs = self._merge_kwargs(
From 8b171a777bac10bbb9c9a13bd36d6ffd10be9b9d Mon Sep 17 00:00:00 2001
From: yonigozlan
Date: Wed, 14 Aug 2024 14:13:17 +0000
Subject: [PATCH 5/6] Add support for multiple images per prompt in
 image-text-to-text mode idefics

---
 .../models/idefics/processing_idefics.py          | 16 +++++++++-------
 .../idefics/test_image_processing_idefics.py      |  1 +
 tests/models/idefics/test_processor_idefics.py    | 15 +++++++--------
 3 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py
index 3a7fcc80ffb225..7f6ff773d8ee5b 100644
--- a/src/transformers/models/idefics/processing_idefics.py
+++ b/src/transformers/models/idefics/processing_idefics.py
@@ -349,26 +349,28 @@ def __call__(
             if not isinstance(images, (list, tuple)):
                 images = [images]
             if isinstance(text, str):
-                # one prompt for all images instead of one prompt per image
-                text = [text] * len(images)
-            # Check if batched text is provided
+                text = [text]
+            # Check if batched images and text are in the correct format
             if isinstance(text, (list, tuple)) and len(text) != len(images):
                 raise ValueError(
-                    "When using the image-text-to-text behavior, the number of prompts should be the same as the number of images."
+                    "When providing both images and text arguments, the number of text prompts should be the same as the number of images. "
+                    "If you want to have several images per prompt, images should be nested as such: images=[[img1, img2], [img3, img4], ...] for text=[prompt1, prompt2, ...]."
) # Check that only text is present in the prompts if not all(isinstance(i, str) for i in text): raise ValueError("When using the image-text-to-text behavior, the prompts should only contain text.") + if isinstance(images[0], (list, tuple)): + # if nested images, nest text as well + text = [[i] for i in text] prompts = list(zip(images, text)) - # Temporary fix for "paddding_side" in init_kwargs - _ = self.tokenizer.init_kwargs.pop("padding_side", None) - output_kwargs = self._merge_kwargs( IdeficsProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, ) + # Temporary fix for "paddding_side" in init_kwargs + _ = output_kwargs["text_kwargs"].pop("padding_side", None) add_eos_token = output_kwargs["text_kwargs"].pop("add_eos_token", False) add_end_of_utterance_token = output_kwargs["text_kwargs"].pop("add_end_of_utterance_token", None) diff --git a/tests/models/idefics/test_image_processing_idefics.py b/tests/models/idefics/test_image_processing_idefics.py index 2f7a8993df5348..b567d97a13be67 100644 --- a/tests/models/idefics/test_image_processing_idefics.py +++ b/tests/models/idefics/test_image_processing_idefics.py @@ -49,6 +49,7 @@ def __init__( image_mean=[0.48145466, 0.4578275, 0.40821073], image_std=[0.26862954, 0.26130258, 0.27577711], ): + super().__init__() size = size if size is not None else {"shortest_edge": 30} self.parent = parent self.batch_size = batch_size diff --git a/tests/models/idefics/test_processor_idefics.py b/tests/models/idefics/test_processor_idefics.py index bdb5554b9402d7..e658be7ac9f26a 100644 --- a/tests/models/idefics/test_processor_idefics.py +++ b/tests/models/idefics/test_processor_idefics.py @@ -18,6 +18,13 @@ import numpy as np +from transformers import ( + AutoProcessor, + IdeficsImageProcessor, + IdeficsProcessor, + LlamaTokenizerFast, + PreTrainedTokenizerFast, +) from transformers.testing_utils import require_torch, require_vision from transformers.utils import is_torch_available, is_vision_available @@ -30,14 +37,6 @@ if is_vision_available(): from PIL import Image - from transformers import ( - AutoProcessor, - IdeficsImageProcessor, - IdeficsProcessor, - LlamaTokenizerFast, - PreTrainedTokenizerFast, - ) - @require_torch @require_vision From 747fbe1f38050c063a23e6928a45e37ab2338bf2 Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Thu, 15 Aug 2024 23:32:22 +0000 Subject: [PATCH 6/6] Fix processor input args in idefics tests --- tests/models/idefics/test_modeling_idefics.py | 2 +- tests/models/idefics/test_processor_idefics.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/models/idefics/test_modeling_idefics.py b/tests/models/idefics/test_modeling_idefics.py index 0197ebcaff5388..a49bce8d878fb4 100644 --- a/tests/models/idefics/test_modeling_idefics.py +++ b/tests/models/idefics/test_modeling_idefics.py @@ -662,7 +662,7 @@ def test_inference_natural_language_visual_reasoning(self): "HuggingFaceM4/idefics-9b", quantization_config=quantization_config, device_map="auto" ) processor = self.default_processor - inputs = processor(prompts, return_tensors="pt", padding="longest").to(torch_device) + inputs = processor(text=prompts, return_tensors="pt", padding="longest").to(torch_device) generated_ids = model.generate(**inputs, max_length=100) generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True) diff --git a/tests/models/idefics/test_processor_idefics.py b/tests/models/idefics/test_processor_idefics.py index e658be7ac9f26a..c88030775b0143 100644 --- 
a/tests/models/idefics/test_processor_idefics.py +++ b/tests/models/idefics/test_processor_idefics.py @@ -132,7 +132,7 @@ def test_processor(self): prompts = self.prepare_prompts() # test that all prompts succeeded - input_processor = processor(prompts, return_tensors="pt", padding="longest") + input_processor = processor(text=prompts, return_tensors="pt", padding="longest") for key in self.input_keys: assert torch.is_tensor(input_processor[key]) @@ -165,8 +165,8 @@ def test_tokenizer_padding(self): ] prompts = [[prompt] for prompt in self.prepare_prompts()[2]] - max_length = processor(prompts, padding="max_length", truncation=True, max_length=20, return_tensors="pt") - longest = processor(prompts, padding="longest", truncation=True, max_length=30, return_tensors="pt") + max_length = processor(text=prompts, padding="max_length", truncation=True, max_length=20, return_tensors="pt") + longest = processor(text=prompts, padding="longest", truncation=True, max_length=30, return_tensors="pt") decoded_max_length = processor.tokenizer.decode(max_length["input_ids"][-1]) decoded_longest = processor.tokenizer.decode(longest["input_ids"][-1]) @@ -193,8 +193,8 @@ def test_tokenizer_left_padding(self): ([0] * 10) + ([1] * 10), ] prompts = [[prompt] for prompt in self.prepare_prompts()[2]] - max_length = processor(prompts, padding="max_length", truncation=True, max_length=20) - longest = processor(prompts, padding="longest", truncation=True, max_length=30) + max_length = processor(text=prompts, padding="max_length", truncation=True, max_length=20) + longest = processor(text=prompts, padding="longest", truncation=True, max_length=30) decoded_max_length = processor.tokenizer.decode(max_length["input_ids"][-1]) decoded_longest = processor.tokenizer.decode(longest["input_ids"][-1]) @@ -212,7 +212,7 @@ def test_model_input_names(self): processor = IdeficsProcessor(tokenizer=tokenizer, image_processor=image_processor) prompts = self.prepare_prompts() - inputs = processor(prompts, padding="longest", return_tensors="pt") + inputs = processor(text=prompts, padding="longest", return_tensors="pt") # For now the processor supports only ['pixel_values', 'input_ids', 'attention_mask'] self.assertSetEqual(set(inputs.keys()), set(self.input_keys))
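

Taken together, the six patches leave `IdeficsProcessor` and `Idefics2Processor` with the same images-first surface (`images`, `text`, `audio`, `videos`, `**kwargs`) and the shared `ProcessingKwargs` option handling. A closing sketch of the resulting convention — the checkpoint is the one the idefics2 tests load, but the images and prompt below are dummy stand-ins, not values from the diffs:

```python
# Sketch of the uniform calling convention after this series (dummy inputs).
import numpy as np
from PIL import Image
from transformers import AutoProcessor

# Dummy images standing in for the real test fixtures.
image1 = Image.fromarray(np.zeros((64, 64, 3), dtype=np.uint8))
image2 = Image.fromarray(np.full((64, 64, 3), 255, dtype=np.uint8))

processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b")

# Flat kwargs: dispatched into text_kwargs / images_kwargs / common_kwargs
# by _merge_kwargs, overriding the _defaults declared on the kwargs class.
inputs = processor(
    images=[[image1, image2]],
    text=["<image><image>In this image, we see"],
    padding="longest",
    return_tensors="pt",
)

# Structured form, equivalent to the flat call above and exercised by the
# new test_structured_kwargs_* tests.
inputs = processor(
    images=[[image1, image2]],
    text=["<image><image>In this image, we see"],
    text_kwargs={"padding": "longest"},
    common_kwargs={"return_tensors": "pt"},
)
```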