diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py
index cdc7e9ba4e77b0..dca3b177cd3fc1 100644
--- a/src/transformers/models/idefics2/modeling_idefics2.py
+++ b/src/transformers/models/idefics2/modeling_idefics2.py
@@ -1558,7 +1558,7 @@ def forward(
         ...   "In which city is that bridge located?<image>",
         ... ]
         >>> images = [[image1, image2], [image3]]
-        >>> inputs = processor(text=prompts, images=images, padding=True, return_tensors="pt").to("cuda")
+        >>> inputs = processor(images=images, text=prompts, padding=True, return_tensors="pt").to("cuda")
 
         >>> # Generate
         >>> generated_ids = model.generate(**inputs, bad_words_ids=BAD_WORDS_IDS, max_new_tokens=20)
diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py
index b42a121eda26dc..96b7c7601cac12 100644
--- a/src/transformers/models/idefics2/processing_idefics2.py
+++ b/src/transformers/models/idefics2/processing_idefics2.py
@@ -17,6 +17,7 @@
 """
 
 import sys
+import warnings
 from typing import TYPE_CHECKING, List, Optional, Union
 
 from ...feature_extraction_utils import BatchFeature
@@ -149,7 +150,7 @@ def __call__(
         ...     "<image>In this image, we see",
         ...     "bla bla bla<image>",
         ... ]
-        >>> outputs = processor(text=text, images=images, return_tensors="pt", padding=True)
+        >>> outputs = processor(images=images, text=text, return_tensors="pt", padding=True)
         >>> input_ids = outputs.input_ids
         >>> input_tokens = processor.tokenizer.batch_decode(input_ids)
         >>> print(input_tokens)
@@ -169,6 +170,24 @@ def __call__(
                 `<fake_token_around_image>` + `<image>` * `image_seq_len` * <fake_token_around_image>`.
 
         """
+        if text is None and images is None:
+            raise ValueError("You must provide either `text` or `images`.")
+        # Backward compatibility: the warning below targets callers that pass text before images.
+        # If `text` does not start with a string, or `images` is not an image / list of images /
+        # nested list of images, assume the two arguments are reversed and swap them.
+        if (text is not None and not isinstance(text[0], str)) or (
+            images is not None
+            and not (
+                is_image_or_image_url(images)
+                or is_image_or_image_url(images[0])
+                or (isinstance(images[0], list) and is_image_or_image_url(images[0][0]))
+            )
+        ):
+            warnings.warn(
+                "It looks like you are passing the inputs in the wrong order. You should pass the images input first and the text input second. "
+                "Images and text inputs will be swapped."
+            )
+            images, text = text, images
 
         output_kwargs = self._merge_kwargs(
             Idefics2ProcessorKwargs,