From 8b171a777bac10bbb9c9a13bd36d6ffd10be9b9d Mon Sep 17 00:00:00 2001
From: yonigozlan
Date: Wed, 14 Aug 2024 14:13:17 +0000
Subject: [PATCH] Add support for multiple images per prompt in Idefics
 image-text-to-text mode

---
 .../models/idefics/processing_idefics.py       | 16 +++++++++-------
 .../idefics/test_image_processing_idefics.py   |  1 +
 tests/models/idefics/test_processor_idefics.py | 15 +++++++--------
 3 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py
index 3a7fcc80ffb225..7f6ff773d8ee5b 100644
--- a/src/transformers/models/idefics/processing_idefics.py
+++ b/src/transformers/models/idefics/processing_idefics.py
@@ -349,26 +349,28 @@ def __call__(
         if not isinstance(images, (list, tuple)):
             images = [images]
         if isinstance(text, str):
-            # one prompt for all images instead of one prompt per image
-            text = [text] * len(images)
-        # Check if batched text is provided
+            text = [text]
+        # Check if batched images and text are in the correct format
         if isinstance(text, (list, tuple)) and len(text) != len(images):
             raise ValueError(
-                "When using the image-text-to-text behavior, the number of prompts should be the same as the number of images."
+                "When providing both images and text arguments, the number of text prompts should be the same as the number of images. "
+                "If you want to have several images per prompt, images should be nested as such: images=[[img1, img2], [img3, img4], ...] for text=[prompt1, prompt2, ...]."
             )
         # Check that only text is present in the prompts
         if not all(isinstance(i, str) for i in text):
             raise ValueError("When using the image-text-to-text behavior, the prompts should only contain text.")
+        if isinstance(images[0], (list, tuple)):
+            # if nested images, nest text as well
+            text = [[i] for i in text]
         prompts = list(zip(images, text))
 
-        # Temporary fix for "paddding_side" in init_kwargs
-        _ = self.tokenizer.init_kwargs.pop("padding_side", None)
-
         output_kwargs = self._merge_kwargs(
             IdeficsProcessorKwargs,
             tokenizer_init_kwargs=self.tokenizer.init_kwargs,
             **kwargs,
         )
+        # Temporary fix for "padding_side" in init_kwargs
+        _ = output_kwargs["text_kwargs"].pop("padding_side", None)
 
         add_eos_token = output_kwargs["text_kwargs"].pop("add_eos_token", False)
         add_end_of_utterance_token = output_kwargs["text_kwargs"].pop("add_end_of_utterance_token", None)
diff --git a/tests/models/idefics/test_image_processing_idefics.py b/tests/models/idefics/test_image_processing_idefics.py
index 2f7a8993df5348..b567d97a13be67 100644
--- a/tests/models/idefics/test_image_processing_idefics.py
+++ b/tests/models/idefics/test_image_processing_idefics.py
@@ -49,6 +49,7 @@ def __init__(
         image_mean=[0.48145466, 0.4578275, 0.40821073],
         image_std=[0.26862954, 0.26130258, 0.27577711],
     ):
+        super().__init__()
         size = size if size is not None else {"shortest_edge": 30}
         self.parent = parent
         self.batch_size = batch_size
diff --git a/tests/models/idefics/test_processor_idefics.py b/tests/models/idefics/test_processor_idefics.py
index bdb5554b9402d7..e658be7ac9f26a 100644
--- a/tests/models/idefics/test_processor_idefics.py
+++ b/tests/models/idefics/test_processor_idefics.py
@@ -18,6 +18,13 @@
 
 import numpy as np
 
+from transformers import (
+    AutoProcessor,
+    IdeficsImageProcessor,
+    IdeficsProcessor,
+    LlamaTokenizerFast,
+    PreTrainedTokenizerFast,
+)
 from transformers.testing_utils import require_torch, require_vision
 from transformers.utils import is_torch_available, is_vision_available
 
@@ -30,14 +37,6 @@ if is_vision_available():
     from PIL import Image
 
-    from transformers import (
-        AutoProcessor,
-        IdeficsImageProcessor,
-        IdeficsProcessor,
-        LlamaTokenizerFast,
-        PreTrainedTokenizerFast,
-    )
-
 
 
 @require_torch
 @require_vision
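
Reviewer note (not part of the patch): below is a minimal usage sketch of the behavior this change enables, assuming the public IdeficsProcessor call signature shown in the diff. The checkpoint name is only illustrative and the images are blank placeholders.

    from PIL import Image

    from transformers import AutoProcessor

    # Illustrative Idefics checkpoint; any Idefics processor behaves the same.
    processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics-9b")

    # Blank placeholder images standing in for real inputs.
    img1, img2, img3, img4 = (Image.new("RGB", (224, 224)) for _ in range(4))

    # One image per prompt: flat list of images, one text prompt per image.
    inputs = processor(
        images=[img1, img2],
        text=["Describe the image.", "Describe the image."],
        return_tensors="pt",
    )

    # Several images per prompt: nest the images per batch entry; the text
    # argument stays one prompt per entry, and lengths must still match.
    inputs = processor(
        images=[[img1, img2], [img3, img4]],
        text=["Compare the two images.", "Compare the two images."],
        return_tensors="pt",
    )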