diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py index 8e9e196764f923..3406ab2226e08b 100644 --- a/src/transformers/models/idefics/processing_idefics.py +++ b/src/transformers/models/idefics/processing_idefics.py @@ -16,13 +16,21 @@ Processor class for IDEFICS. """ -from typing import Callable, List, Optional, Union +from typing import Callable, Dict, List, Optional, Union from urllib.parse import urlparse from ...feature_extraction_utils import BatchFeature -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, TextInput, TruncationStrategy +from ...processing_utils import ( + ImagesKwargs, + ProcessingKwargs, + ProcessorMixin, + TextKwargs, + Unpack, + _validate_images_text_input_order, +) +from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import is_tf_available, is_torch_available +from ...utils.deprecation import deprecate_kwarg if is_torch_available(): @@ -34,6 +42,32 @@ IMAGE_TOKEN = "<image>" +class IdeficsImagesKwargs(ImagesKwargs, total=False): + transform: Optional[Callable] + image_size: Optional[Dict[str, int]] + image_mean: Optional[Union[float, List[float]]] + image_std: Optional[Union[float, List[float]]] + + +class IdeficsTextKwargs(TextKwargs, total=False): + add_eos_token: Optional[bool] + add_end_of_utterance_token: Optional[bool] + + +class IdeficsProcessorKwargs(ProcessingKwargs, total=False): + text_kwargs: IdeficsTextKwargs + images_kwargs: IdeficsImagesKwargs + _defaults = { + "text_kwargs": { + "add_special_tokens": False, + "padding": "longest", + "add_eos_token": False, + }, + "images_kwargs": {}, + "common_kwargs": {"return_tensors": "pt"}, + } + + # copied from m4.training.packing def incremental_to_binary_attention_mask(incremental_mask, return_tensors, num_classes=-1): # Set elements >= num_classes to -1 @@ -199,52 +233,32 @@ def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_u else False ) + @deprecate_kwarg(old_name="prompts", version="5.0.0", new_name="text", raise_if_both_names=True) def __call__( self, - prompts: Union[List[TextInput], List[List[TextInput]]], - padding: Union[bool, str, PaddingStrategy] = "longest", - truncation: Union[bool, str, TruncationStrategy] = None, - max_length: Optional[int] = None, - transform: Callable = None, - add_eos_token=False, - add_end_of_utterance_token=None, - debug=False, - return_tensors="pt", - ) -> BatchEncoding: + images=None, + text: Union[ + TextInput, + PreTokenizedInput, + List[TextInput], + List[PreTokenizedInput], + List[List[TextInput]], + List[List[PreTokenizedInput]], + ] = None, + audio=None, + videos=None, + **kwargs: Unpack[IdeficsProcessorKwargs], + ) -> BatchFeature: """This method takes batched or non-batched prompts made of text and images and converts them into prompts that the model was trained on and prepares the image pixel values for the model to process. Args: - prompts (`Union[List[TextInput], [List[List[TextInput]]]]`): + images (`Union[PIL.Image, str, List[PIL.Image], List[str]]`): + either a single image or a batched list of images - can be passed in when text contains only text prompts, + in order to use the image-text-to-text behavior. + text (`Union[List[TextInput], [List[List[TextInput]]]]`): either a single prompt or a batched list of prompts - see the detailed description immediately after the end of the arguments doc section.
- padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `"longest"`): - Select a strategy to pad the returned sequences (according to the model's padding side and padding - index) among: - - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single - sequence if provided). - - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum - acceptable input length for the model if that argument is not provided. - - `False` or `'do_not_pad'`: No padding. This will raise an error if the input sequences are of different - lengths. - Note: Unlike most processors, which set padding=`False` by default, `IdeficsProcessor` sets `padding="longest"` - by default. See https://github.com/huggingface/transformers/pull/29449#pullrequestreview-1925576061 for why. - max_length (`int`, *optional*): - Maximum length of the returned list and optionally padding length (see above). - truncation (`bool`, *optional*): - Activates truncation to cut input sequences longer than `max_length` to `max_length`. - transform (`Callable`, *optional*): - A custom transform function that accepts a single image can be passed for training. For example, - `torchvision.Compose` can be used to compose multiple functions. If `None` a preset inference-specific - set of transforms will be applied to the images - add_eos_token (`bool`, *optional*, defaults to `False`): - Adds `eos_token` at the end of the final prompt if True` - add_end_of_utterance_token (`bool`, *optional*) - Whether to automatically add `` after each prompt's text input (unless followed by an - image). If `None` the tokenizer will be checked instead and if this token is found in - `additional_special_tokens` then the value will be `True`. - debug (`bool`, *optional*, defaults to `False`): - `True` value will help debug prompt generation by dumping useful information return_tensors (`str` or `TensorType`, *optional*, defaults to `TensorType.PYTORCH`): The type of tensors to return. Can be one of: - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. @@ -255,7 +269,7 @@ def __call__( Detailed explanation: - Each entry in `prompts` is either a text to be passed as is or an image that will be processed. + Each entry in `text` is either a text to be passed as is or an image that will be processed. An image can be either an image object (`PIL.Image`) or a url from which the image can be retrieved. @@ -279,7 +293,7 @@ def __call__( "Describe this image.\nAssistant:", ] - inputs = processor(prompts, return_tensors="pt") + inputs = processor(text=prompts, return_tensors="pt") generated_ids = model.generate(**inputs, max_length=100) generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] ``` @@ -311,18 +325,55 @@ def __call__( transforms.Normalize(mean=self.image_mean, std=self.image_std), ] ) - inputs = processor(prompts, transform=image_transform, return_tensors="pt") + inputs = processor(text=prompts, transform=image_transform, return_tensors="pt") ``` In order to help debug prompt generation enable `debug=True` which will show you what's happening. 
""" + if images is None and text is None: + raise ValueError("You need to specify either `text` or `images` and `text`.") + # check if images and text inputs are reversed for BC + images, text = _validate_images_text_input_order(images, text) + + if images is None: + # assuming the user wants to use the old behavior with prompts as the only argument + prompts = text + elif text is not None: + # Assuming image-text-to-text behavior: + # Check if batched images are provided + if not isinstance(images, (list, tuple)): + images = [images] + if isinstance(text, str): + text = [text] + # Check if batched images and text are in the correct format + if isinstance(text, (list, tuple)) and len(text) != len(images): + raise ValueError( + "When providing both images and text arguments, the number of text prompts should be the same as the number of images." + "If you want to have several images per prompt, images should be nested as such: images=[[img1, img2], [img3, img4], ...] for text=[prompt1, prompt2, ...]." + ) + # Check that only text is present in the prompts + if not all(isinstance(i, str) for i in text): + raise ValueError("When using the image-text-to-text behavior, the prompts should only contain text.") + if isinstance(images[0], (list, tuple)): + # if nested images, nest text as well + text = [[i] for i in text] + prompts = list(zip(images, text)) + + output_kwargs = self._merge_kwargs( + IdeficsProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + + add_eos_token = output_kwargs["text_kwargs"].pop("add_eos_token", False) + add_end_of_utterance_token = output_kwargs["text_kwargs"].pop("add_end_of_utterance_token", None) # if the value isn't overriden by the user, check if the tokenizer was trained with this token and then use it if add_end_of_utterance_token is None: add_end_of_utterance_token = self.tokenizer_was_trained_with_end_of_utterance_token # turn non-batched prompts into batched - if not any(isinstance(i, list) for i in prompts): + if not any(isinstance(i, (list, tuple)) for i in prompts): prompts = [prompts] fake_token = "" @@ -371,21 +422,14 @@ def image_tokens(last_was_image): if add_eos_token: full_text += self.tokenizer.eos_token - if debug is True: - print(f"{full_text=}") - - image_objects = self.image_processor(image_objects, transform=transform, return_tensors=return_tensors) + image_objects = self.image_processor(image_objects, **output_kwargs["images_kwargs"]) all_prompts.append(full_text) all_images.append(image_objects) - text_encoding = self.tokenizer( - text=all_prompts, - add_special_tokens=False, - padding=padding, - truncation=truncation, - max_length=max_length, - ) + # For BC + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", "pt") + text_encoding = self.tokenizer(all_prompts, **output_kwargs["text_kwargs"]) all_texts = text_encoding["input_ids"] all_attention_masks = text_encoding["attention_mask"] @@ -398,12 +442,12 @@ def image_tokens(last_was_image): output_images = [] output_attention_masks = [] - for text, attention_mask, images in zip(all_texts, all_attention_masks, all_images): - padded_input_ids = text + for text_single, attention_mask, extracted_images in zip(all_texts, all_attention_masks, all_images): + padded_input_ids = text_single image_count = padded_input_ids.count(self.image_token_id) local_max_num_images = min(image_count, max_num_images) - current_images = images[:local_max_num_images] + current_images = extracted_images[:local_max_num_images] if len(current_images) > 0: if return_tensors 
== "pt": diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py index 056811138155f3..e4079a2368fdb1 100644 --- a/src/transformers/models/idefics2/modeling_idefics2.py +++ b/src/transformers/models/idefics2/modeling_idefics2.py @@ -1584,7 +1584,7 @@ def forward( ... "In which city is that bridge located?", ... ] >>> images = [[image1, image2], [image3]] - >>> inputs = processor(text=prompts, images=images, padding=True, return_tensors="pt").to("cuda") + >>> inputs = processor(images=images, text=prompts, padding=True, return_tensors="pt").to("cuda") >>> # Generate >>> generated_ids = model.generate(**inputs, bad_words_ids=BAD_WORDS_IDS, max_new_tokens=20) diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py index 2e14118144baaa..68566d182678c2 100644 --- a/src/transformers/models/idefics2/processing_idefics2.py +++ b/src/transformers/models/idefics2/processing_idefics2.py @@ -20,9 +20,15 @@ from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, is_valid_image, load_image -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import AddedToken, BatchEncoding, PaddingStrategy, TextInput, TruncationStrategy -from ...utils import TensorType, logging +from ...processing_utils import ( + ImagesKwargs, + ProcessingKwargs, + ProcessorMixin, + Unpack, + _validate_images_text_input_order, +) +from ...tokenization_utils_base import AddedToken, TextInput +from ...utils import logging if TYPE_CHECKING: @@ -40,6 +46,23 @@ def is_image_or_image_url(elem): return is_url(elem) or is_valid_image(elem) +class Idefics2ImagesKwargs(ImagesKwargs, total=False): + image_seq_len: Optional[int] + + +class Idefics2ProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: Idefics2ImagesKwargs + + _defaults = { + "text_kwargs": { + "add_special_tokens": True, + "padding": False, + "is_split_into_words": False, + }, + "images_kwargs": {}, + } + + class Idefics2Processor(ProcessorMixin): r""" Constructs a IDEFICS2 processor which wraps a LLama tokenizer and IDEFICS2 image processor into a single processor. @@ -97,16 +120,12 @@ def _extract_images_from_prompts(self, prompts): def __call__( self, - text: Union[TextInput, "PreTokenizedInput", List[TextInput], List["PreTokenizedInput"]] = None, images: Union[ImageInput, List[ImageInput], List[List[ImageInput]]] = None, - image_seq_len: Optional[int] = None, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = None, - max_length: Optional[int] = None, - is_split_into_words: bool = False, - add_special_tokens: bool = True, - return_tensors: Optional[Union[str, TensorType]] = None, - ) -> BatchEncoding: + text: Union[TextInput, "PreTokenizedInput", List[TextInput], List["PreTokenizedInput"]] = None, + audio=None, + videos=None, + **kwargs: Unpack[Idefics2ProcessorKwargs], + ) -> BatchFeature: """ Processes the input prompts and returns a BatchEncoding. @@ -130,7 +149,7 @@ def __call__( ... "In this image, we see", ... "bla bla bla", ... 
] - >>> outputs = processor(text=text, images=images, return_tensors="pt", padding=True) + >>> outputs = processor(images=images, text=text, return_tensors="pt", padding=True) >>> input_ids = outputs.input_ids >>> input_tokens = processor.tokenizer.batch_decode(input_ids) >>> print(input_tokens) @@ -138,6 +157,9 @@ ``` Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. If is of type `List[ImageInput]`, it's assumed that this is for a single prompt i.e. of batch size 1. text (`Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]`, *optional*): The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set @@ -145,27 +167,22 @@ Wherever an image token, `<image>` is encountered it is expanded to `<fake_token_around_image>` + `<image>` * `image_seq_len` * `<fake_token_around_image>`. - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. If is of type `List[ImageInput]`, it's assumed that this is for a single prompt i.e. of batch size 1. - image_seq_len (`int`, *optional*): - The length of the image sequence. If not provided, the default value is used. - padding (`Union[bool, str, PaddingStrategy]`, *optional*, defaults to `False`): - Padding strategy applied to the input ids. See [`PreTrainedTokenizerFast.pad`] for more information. - truncation (`Union[bool, str, TruncationStrategy]`, *optional*): - Truncation strategy applied to the input ids. See [`PreTrainedTokenizerFast.truncate`] for more information. - max_length (`int`, *optional*): - Maximum length of the returned list and optionally padding/truncation length. See - [`PreTrainedTokenizerFast.__call__`] for more information. - is_split_into_words (`bool`, *optional*, defaults to `False`): - Whether the input text is split into words or not. If set to `True`, the tokenizer will skip the - tokenization process and assume the input is already tokenized. - add_special_tokens (`bool`, *optional*, defaults to `True`): - Whether to add special tokens or not. See [`PreTrainedTokenizerFast.__call__`] for more information. return_tensors (`Union[str, TensorType]`, *optional*): If set, will return tensors of a particular framework. See [`PreTrainedTokenizerFast.__call__`] for more information.
+ """ + if text is None and images is None: + raise ValueError("You must provide either `text` or `images`.") + # check if images and text inputs are reversed for BC + images, text = _validate_images_text_input_order(images, text) + + output_kwargs = self._merge_kwargs( + Idefics2ProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + image_seq_len = output_kwargs["images_kwargs"].pop("image_seq_len", None) image_seq_len = image_seq_len if image_seq_len is not None else self.image_seq_len n_images_in_text = [] @@ -194,15 +211,7 @@ def __call__( sample = sample.replace(f"{fake_image_token}{fake_image_token}", f"{fake_image_token}") prompt_strings.append(sample) - text_inputs = self.tokenizer( - text=prompt_strings, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - is_split_into_words=is_split_into_words, - return_tensors=return_tensors, - ) + text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) inputs.update(text_inputs) if images is not None: @@ -227,7 +236,7 @@ def __call__( # Load images if they are URLs images = [[load_image(im) for im in sample] for sample in images] - image_inputs = self.image_processor(images, return_tensors=return_tensors) + image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) inputs.update(image_inputs) return inputs diff --git a/tests/models/idefics/test_modeling_idefics.py b/tests/models/idefics/test_modeling_idefics.py index 0197ebcaff5388..a49bce8d878fb4 100644 --- a/tests/models/idefics/test_modeling_idefics.py +++ b/tests/models/idefics/test_modeling_idefics.py @@ -662,7 +662,7 @@ def test_inference_natural_language_visual_reasoning(self): "HuggingFaceM4/idefics-9b", quantization_config=quantization_config, device_map="auto" ) processor = self.default_processor - inputs = processor(prompts, return_tensors="pt", padding="longest").to(torch_device) + inputs = processor(text=prompts, return_tensors="pt", padding="longest").to(torch_device) generated_ids = model.generate(**inputs, max_length=100) generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True) diff --git a/tests/models/idefics/test_processor_idefics.py b/tests/models/idefics/test_processor_idefics.py index 26dcbb1c0f1566..062b578a684ede 100644 --- a/tests/models/idefics/test_processor_idefics.py +++ b/tests/models/idefics/test_processor_idefics.py @@ -12,11 +12,24 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import shutil +import tempfile +import unittest + import numpy as np -from transformers.testing_utils import TestCasePlus, require_torch, require_vision +from transformers import ( + AutoProcessor, + IdeficsImageProcessor, + IdeficsProcessor, + LlamaTokenizerFast, + PreTrainedTokenizerFast, +) +from transformers.testing_utils import require_torch, require_vision from transformers.utils import is_torch_available, is_vision_available +from ...test_processing_common import ProcessorTesterMixin + if is_torch_available(): import torch @@ -24,37 +37,32 @@ if is_vision_available(): from PIL import Image - from transformers import ( - AutoProcessor, - IdeficsImageProcessor, - IdeficsProcessor, - LlamaTokenizerFast, - PreTrainedTokenizerFast, - ) - @require_torch @require_vision -class IdeficsProcessorTest(TestCasePlus): - def setUp(self): - super().setUp() +class IdeficsProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = IdeficsProcessor - self.checkpoint_path = self.get_auto_remove_tmp_dir() + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() image_processor = IdeficsImageProcessor(return_tensors="pt") tokenizer = LlamaTokenizerFast.from_pretrained("HuggingFaceM4/tiny-random-idefics") processor = IdeficsProcessor(image_processor, tokenizer) - processor.save_pretrained(self.checkpoint_path) + processor.save_pretrained(self.tmpdirname) self.input_keys = ["pixel_values", "input_ids", "attention_mask", "image_attention_mask"] def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.checkpoint_path, **kwargs).tokenizer + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.checkpoint_path, **kwargs).image_processor + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor + + def tearDown(self): + shutil.rmtree(self.tmpdirname) def prepare_prompts(self): """This function prepares a list of PIL images""" @@ -100,13 +108,13 @@ def prepare_prompts(self): def test_save_load_pretrained_additional_features(self): processor = IdeficsProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) - processor.save_pretrained(self.checkpoint_path) + processor.save_pretrained(self.tmpdirname) tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) processor = IdeficsProcessor.from_pretrained( - self.checkpoint_path, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 + self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 ) self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) @@ -124,7 +132,7 @@ def test_processor(self): prompts = self.prepare_prompts() # test that all prompts succeeded - input_processor = processor(prompts, return_tensors="pt", padding="longest") + input_processor = processor(text=prompts, return_tensors="pt", padding="longest") for key in self.input_keys: assert torch.is_tensor(input_processor[key]) @@ -157,8 +165,8 @@ def test_tokenizer_padding(self): ] prompts = [[prompt] for prompt in self.prepare_prompts()[2]] - max_length = processor(prompts, padding="max_length", truncation=True, max_length=20, return_tensors="pt") - longest = processor(prompts, padding="longest", truncation=True, max_length=30, return_tensors="pt") + max_length = processor(text=prompts, padding="max_length", 
truncation=True, max_length=20, return_tensors="pt") + longest = processor(text=prompts, padding="longest", truncation=True, max_length=30, return_tensors="pt") decoded_max_length = processor.tokenizer.decode(max_length["input_ids"][-1]) decoded_longest = processor.tokenizer.decode(longest["input_ids"][-1]) @@ -185,8 +193,8 @@ def test_tokenizer_left_padding(self): ([0] * 10) + ([1] * 10), ] prompts = [[prompt] for prompt in self.prepare_prompts()[2]] - max_length = processor(prompts, padding="max_length", truncation=True, max_length=20) - longest = processor(prompts, padding="longest", truncation=True, max_length=30) + max_length = processor(text=prompts, padding="max_length", truncation=True, max_length=20) + longest = processor(text=prompts, padding="longest", truncation=True, max_length=30) decoded_max_length = processor.tokenizer.decode(max_length["input_ids"][-1]) decoded_longest = processor.tokenizer.decode(longest["input_ids"][-1]) @@ -204,7 +212,143 @@ def test_model_input_names(self): processor = IdeficsProcessor(tokenizer=tokenizer, image_processor=image_processor) prompts = self.prepare_prompts() - inputs = processor(prompts, padding="longest", return_tensors="pt") + inputs = processor(text=prompts, padding="longest", return_tensors="pt") # For now the processor supports only ['pixel_values', 'input_ids', 'attention_mask'] self.assertSetEqual(set(inputs.keys()), set(self.input_keys)) + + # Override the following tests as Idefics image processor does not accept do_rescale and rescale_factor + @require_torch + @require_vision + def test_image_processor_defaults_preserved_by_image_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor", image_size=234) + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = self.prepare_text_inputs() + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + self.assertEqual(len(inputs["pixel_values"][0][0][0]), 234) + + @require_torch + @require_vision + def test_kwargs_overrides_default_image_processor_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor", image_size=234) + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = self.prepare_text_inputs() + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input, image_size=224) + self.assertEqual(len(inputs["pixel_values"][0][0][0]), 224) + + @require_torch + @require_vision + def test_unstructured_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = self.prepare_text_inputs() + image_input = 
self.prepare_image_inputs() + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + image_size=214, + padding="max_length", + max_length=76, + ) + + self.assertEqual(inputs["pixel_values"].shape[3], 214) + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + @require_vision + def test_unstructured_kwargs_batched(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = self.prepare_text_inputs(batch_size=2) + image_input = self.prepare_image_inputs(batch_size=2) + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + image_size=214, + padding="longest", + max_length=76, + ) + + self.assertEqual(inputs["pixel_values"].shape[3], 214) + self.assertEqual(len(inputs["input_ids"][0]), 8) + + @require_torch + @require_vision + def test_structured_kwargs_nested(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = self.prepare_text_inputs() + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": {"image_size": 214}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.skip_processor_without_typed_kwargs(processor) + self.assertEqual(inputs["pixel_values"].shape[3], 214) + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + @require_vision + def test_structured_kwargs_nested_from_dict(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = self.prepare_text_inputs() + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": {"image_size": 214}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.assertEqual(inputs["pixel_values"].shape[3], 214) + self.assertEqual(len(inputs["input_ids"][0]), 76) diff --git a/tests/models/idefics2/test_processor_idefics2.py b/tests/models/idefics2/test_processor_idefics2.py index 2fd569f99141af..bf713c6fb8cfbb 100644 --- a/tests/models/idefics2/test_processor_idefics2.py +++ b/tests/models/idefics2/test_processor_idefics2.py @@ -13,8 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import shutil +import tempfile import unittest from io import BytesIO +from typing import Optional import requests @@ -22,16 +25,30 @@ from transformers.testing_utils import require_torch, require_vision from transformers.utils import is_vision_available +from ...test_processing_common import ProcessorTesterMixin + if is_vision_available(): from PIL import Image + from transformers import ( + AutoProcessor, + Idefics2Processor, + ) + @require_torch @require_vision -class Idefics2ProcessorTest(unittest.TestCase): +class Idefics2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = Idefics2Processor + def setUp(self): - self.processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics2-8b", image_seq_len=2) + self.tmpdirname = tempfile.mkdtemp() + + processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics2-8b", image_seq_len=2) + + processor.save_pretrained(self.tmpdirname) + self.image1 = Image.open( BytesIO( requests.get( @@ -49,22 +66,35 @@ def setUp(self): ).content ) ) - self.bos_token = self.processor.tokenizer.bos_token - self.image_token = self.processor.image_token.content - self.fake_image_token = self.processor.fake_image_token.content + self.bos_token = processor.tokenizer.bos_token + self.image_token = processor.image_token.content + self.fake_image_token = processor.fake_image_token.content + + self.bos_token_id = processor.tokenizer.convert_tokens_to_ids(self.bos_token) + self.image_token_id = processor.tokenizer.convert_tokens_to_ids(self.image_token) + self.fake_image_token_id = processor.tokenizer.convert_tokens_to_ids(self.fake_image_token) + self.image_seq_len = processor.image_seq_len + + def get_tokenizer(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer + + def get_image_processor(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor - self.bos_token_id = self.processor.tokenizer.convert_tokens_to_ids(self.bos_token) - self.image_token_id = self.processor.tokenizer.convert_tokens_to_ids(self.image_token) - self.fake_image_token_id = self.processor.tokenizer.convert_tokens_to_ids(self.fake_image_token) - self.image_seq_len = self.processor.image_seq_len + def get_processor(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs) + + def tearDown(self): + shutil.rmtree(self.tmpdirname) def test_process_interleaved_images_prompts_no_image_splitting(self): - old_image_splitting = self.processor.image_processor.do_image_splitting + tokenizer = self.get_tokenizer() + processor = self.get_processor() - self.processor.image_processor.do_image_splitting = False + processor.image_processor.do_image_splitting = False # Test that a single image is processed correctly - inputs = self.processor(images=self.image1) + inputs = processor(images=self.image1) self.assertEqual(inputs["pixel_values"].shape, (1, 1, 3, 653, 980)) self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 1, 653, 980)) # fmt: on @@ -73,10 +103,10 @@ def test_process_interleaved_images_prompts_no_image_splitting(self): image_str = "<image>" text_str = "In this image, we see" text = image_str + text_str - inputs = self.processor(text=text, images=self.image1) + inputs = processor(text=text, images=self.image1) # fmt: off - tokenized_sentence = self.processor.tokenizer(text_str, add_special_tokens=False) + tokenized_sentence = tokenizer(text_str, add_special_tokens=False) expected_input_ids = [[self.bos_token_id] + [self.fake_image_token_id] + [self.image_token_id]
* self.image_seq_len + [self.fake_image_token_id] + tokenized_sentence["input_ids"]] self.assertEqual(inputs["input_ids"], expected_input_ids) self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])]) @@ -95,11 +125,11 @@ def test_process_interleaved_images_prompts_no_image_splitting(self): ] images = [[self.image1], [self.image2, self.image3]] - inputs = self.processor(text=text, images=images, padding=True) + inputs = processor(text=text, images=images, padding=True) # fmt: off - tokenized_sentence_1 = self.processor.tokenizer(text_str_1, add_special_tokens=False) - tokenized_sentence_2 = self.processor.tokenizer(text_str_2, add_special_tokens=False) + tokenized_sentence_1 = tokenizer(text_str_1, add_special_tokens=False) + tokenized_sentence_2 = tokenizer(text_str_2, add_special_tokens=False) expected_input_ids_1 = [self.bos_token_id] + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + tokenized_sentence_1["input_ids"] expected_input_ids_2 = [self.bos_token_id] + tokenized_sentence_2["input_ids"] + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] # Pad the first input to match the second input @@ -117,15 +147,13 @@ def test_process_interleaved_images_prompts_no_image_splitting(self): self.assertEqual(inputs['pixel_attention_mask'].shape, (2, 2, 767, 980)) # fmt: on - self.processor.image_processor.do_image_splitting = old_image_splitting - def test_process_interleaved_images_prompts_image_splitting(self): - old_image_splitting = self.processor.image_processor.do_image_splitting - - self.processor.image_processor.do_image_splitting = True + processor = self.get_processor() + tokenizer = self.get_tokenizer() + processor.image_processor.do_image_splitting = True # Test that a single image is processed correctly - inputs = self.processor(images=self.image1) + inputs = processor(images=self.image1) self.assertEqual(inputs["pixel_values"].shape, (1, 5, 3, 653, 980)) self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 5, 653, 980)) # fmt: on @@ -134,10 +162,10 @@ def test_process_interleaved_images_prompts_image_splitting(self): image_str = "<image>" text_str = "In this image, we see" text = image_str + text_str - inputs = self.processor(text=text, images=self.image1) + inputs = processor(text=text, images=self.image1) # fmt: off - tokenized_sentence = self.processor.tokenizer(text_str, add_special_tokens=False) + tokenized_sentence = tokenizer(text_str, add_special_tokens=False) expected_input_ids = [[self.bos_token_id] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + [self.fake_image_token_id] + tokenized_sentence["input_ids"]] self.assertEqual(inputs["input_ids"], expected_input_ids) self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])]) @@ -156,11 +184,11 @@ def test_process_interleaved_images_prompts_image_splitting(self): ] images = [[self.image1], [self.image2, self.image3]] - inputs = self.processor(text=text, images=images, padding=True) + inputs = processor(text=text, images=images, padding=True) # fmt: off - tokenized_sentence_1 = self.processor.tokenizer(text_str_1, add_special_tokens=False) - tokenized_sentence_2 = self.processor.tokenizer(text_str_2, add_special_tokens=False) + tokenized_sentence_1 = tokenizer(text_str_1, add_special_tokens=False) + tokenized_sentence_2 = tokenizer(text_str_2, add_special_tokens=False)
expected_input_ids_1 = [self.bos_token_id] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + [self.fake_image_token_id] + tokenized_sentence_1["input_ids"] expected_input_ids_2 = [self.bos_token_id] + tokenized_sentence_2["input_ids"] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + [self.fake_image_token_id] # Pad the first input to match the second input @@ -178,22 +206,22 @@ def test_process_interleaved_images_prompts_image_splitting(self): self.assertEqual(inputs['pixel_attention_mask'].shape, (2, 10, 767, 980)) # fmt: on - self.processor.image_processor.do_image_splitting = old_image_splitting - def test_add_special_tokens_processor(self): + processor = self.get_processor() + tokenizer = self.get_tokenizer() image_str = "<image>" text_str = "In this image, we see" text = text_str + image_str - n_image_repeat = 5 if self.processor.image_processor.do_image_splitting else 1 + n_image_repeat = 5 if processor.image_processor.do_image_splitting else 1 # fmt: off - inputs = self.processor(text=text, images=self.image1, add_special_tokens=False) - tokenized_sentence = self.processor.tokenizer(text_str, add_special_tokens=False) + inputs = processor(text=text, images=self.image1, add_special_tokens=False) + tokenized_sentence = tokenizer(text_str, add_special_tokens=False) expected_input_ids = [tokenized_sentence["input_ids"] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * n_image_repeat + [self.fake_image_token_id]] self.assertEqual(inputs["input_ids"], expected_input_ids) - inputs = self.processor(text=text, images=self.image1) + inputs = processor(text=text, images=self.image1) expected_input_ids = [[self.bos_token_id] + tokenized_sentence["input_ids"] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * n_image_repeat + [self.fake_image_token_id]] self.assertEqual(inputs["input_ids"], expected_input_ids) # fmt: on @@ -222,7 +250,7 @@ def test_apply_chat_template(self): {"role": "user", "content": [{"type": "text", "text": "And who is that?"}]}, ] - processor = self.processor + processor = self.get_processor() # Make short sequence length to test that the fake tokens are added correctly rendered = processor.apply_chat_template(messages, add_generation_prompt=True) @@ -233,3 +261,27 @@ "Assistant:" ) self.assertEqual(rendered, expected_rendered) + + # Override as Idefics2Processor needs image tokens in prompts + def prepare_text_inputs(self, batch_size: Optional[int] = None): + if batch_size is None: + return "lower newer <image>" + + if batch_size < 1: + raise ValueError("batch_size must be greater than 0") + + if batch_size == 1: + return ["lower newer <image>"] + return ["lower newer <image>", "<image> upper older longer string"] + ["<image> lower newer"] * ( + batch_size - 2 + ) + + # Override as Idefics2Processor needs nested images to work properly with batched inputs + @require_vision + def prepare_image_inputs(self, batch_size: Optional[int] = None): + """This function prepares a list of PIL images for testing""" + if batch_size is None: + return super().prepare_image_inputs() + if batch_size < 1: + raise ValueError("batch_size must be greater than 0") + return [[super().prepare_image_inputs()]] * batch_size
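Taken together, these changes give `IdeficsProcessor` and `Idefics2Processor` the standard `__call__(images, text, audio, videos, **kwargs)` interface, with per-modality kwargs validated and merged through `ProcessorMixin._merge_kwargs`. Below is a minimal usage sketch of the two calling styles exercised by the new tests, flat kwargs versus nested `text_kwargs`/`images_kwargs`/`common_kwargs`; the checkpoint name is the one used in the IDEFICS integration test above, while the local image path is a placeholder:

```python
from PIL import Image
from transformers import AutoProcessor

# Checkpoint from the IDEFICS integration test in this diff; "cat.jpg" is a placeholder image path.
processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics-9b")
image = Image.open("cat.jpg")
prompt = "User: Describe this image.\nAssistant:"

# Flat kwargs: IdeficsProcessorKwargs routes each key to text_kwargs, images_kwargs or common_kwargs.
inputs = processor(
    images=image,
    text=prompt,
    padding="longest",    # text kwarg (also the IDEFICS default)
    image_size=224,       # image kwarg, forwarded to IdeficsImageProcessor
    return_tensors="pt",  # common kwarg
)

# Nested kwargs: the same call with explicit per-modality dictionaries,
# mirroring test_structured_kwargs_nested above.
inputs = processor(
    images=image,
    text=prompt,
    text_kwargs={"padding": "longest"},
    images_kwargs={"image_size": 224},
    common_kwargs={"return_tensors": "pt"},
)
```

Because `__call__` is wrapped in `@deprecate_kwarg(old_name="prompts", new_name="text", raise_if_both_names=True)`, callers that still pass `prompts=...` keep working until v5.0.0 with a deprecation warning, while passing both `prompts` and `text` raises an error.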
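The `_defaults` dictionaries on `IdeficsProcessorKwargs` and `Idefics2ProcessorKwargs` are resolved by `ProcessorMixin._merge_kwargs`, which layers per-processor defaults, values stored in the tokenizer's `init_kwargs`, and per-call kwargs in increasing order of precedence. A rough, self-contained sketch of that layering for the text kwargs (illustrative only and under the stated precedence assumption; the helper name is invented, this is not the actual `_merge_kwargs` implementation):

```python
from typing import Any, Dict, Set


def layer_text_kwargs(
    defaults: Dict[str, Any],
    tokenizer_init_kwargs: Dict[str, Any],
    call_kwargs: Dict[str, Any],
    allowed_keys: Set[str],
) -> Dict[str, Any]:
    """Illustrative precedence: defaults < tokenizer init kwargs < per-call kwargs."""
    merged = dict(defaults)
    # Only keys declared on the typed kwargs class (e.g. IdeficsTextKwargs) are considered.
    merged.update({k: v for k, v in tokenizer_init_kwargs.items() if k in allowed_keys})
    merged.update({k: v for k, v in call_kwargs.items() if k in allowed_keys})
    return merged


# The IDEFICS text defaults defined in this diff:
defaults = {"add_special_tokens": False, "padding": "longest", "add_eos_token": False}
allowed = {"add_special_tokens", "padding", "truncation", "max_length", "add_eos_token"}
print(layer_text_kwargs(defaults, {}, {"padding": "max_length", "max_length": 20}, allowed))
# {'add_special_tokens': False, 'padding': 'max_length', 'add_eos_token': False, 'max_length': 20}
```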