diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py
index 3a7fcc80ffb225..7f6ff773d8ee5b 100644
--- a/src/transformers/models/idefics/processing_idefics.py
+++ b/src/transformers/models/idefics/processing_idefics.py
@@ -349,26 +349,28 @@ def __call__(
             if not isinstance(images, (list, tuple)):
                 images = [images]
             if isinstance(text, str):
-                # one prompt for all images instead of one prompt per image
-                text = [text] * len(images)
-            # Check if batched text is provided
+                text = [text]
+            # Check if batched images and text are in the correct format
             if isinstance(text, (list, tuple)) and len(text) != len(images):
                 raise ValueError(
-                    "When using the image-text-to-text behavior, the number of prompts should be the same as the number of images."
+                    "When providing both images and text arguments, the number of text prompts should be the same as the number of images. "
+                    "If you want to have several images per prompt, images should be nested as such: images=[[img1, img2], [img3, img4], ...] for text=[prompt1, prompt2, ...]."
                 )
             # Check that only text is present in the prompts
             if not all(isinstance(i, str) for i in text):
                 raise ValueError("When using the image-text-to-text behavior, the prompts should only contain text.")
+            if isinstance(images[0], (list, tuple)):
+                # if nested images, nest text as well
+                text = [[i] for i in text]
             prompts = list(zip(images, text))

-        # Temporary fix for "paddding_side" in init_kwargs
-        _ = self.tokenizer.init_kwargs.pop("padding_side", None)
-
         output_kwargs = self._merge_kwargs(
             IdeficsProcessorKwargs,
             tokenizer_init_kwargs=self.tokenizer.init_kwargs,
             **kwargs,
         )
+        # Temporary fix for "padding_side" in init_kwargs
+        _ = output_kwargs["text_kwargs"].pop("padding_side", None)

         add_eos_token = output_kwargs["text_kwargs"].pop("add_eos_token", False)
         add_end_of_utterance_token = output_kwargs["text_kwargs"].pop("add_end_of_utterance_token", None)
diff --git a/tests/models/idefics/test_image_processing_idefics.py b/tests/models/idefics/test_image_processing_idefics.py
index 2f7a8993df5348..b567d97a13be67 100644
--- a/tests/models/idefics/test_image_processing_idefics.py
+++ b/tests/models/idefics/test_image_processing_idefics.py
@@ -49,6 +49,7 @@ def __init__(
         image_mean=[0.48145466, 0.4578275, 0.40821073],
         image_std=[0.26862954, 0.26130258, 0.27577711],
     ):
+        super().__init__()
         size = size if size is not None else {"shortest_edge": 30}
         self.parent = parent
         self.batch_size = batch_size
diff --git a/tests/models/idefics/test_processor_idefics.py b/tests/models/idefics/test_processor_idefics.py
index bdb5554b9402d7..e658be7ac9f26a 100644
--- a/tests/models/idefics/test_processor_idefics.py
+++ b/tests/models/idefics/test_processor_idefics.py
@@ -18,6 +18,13 @@

 import numpy as np

+from transformers import (
+    AutoProcessor,
+    IdeficsImageProcessor,
+    IdeficsProcessor,
+    LlamaTokenizerFast,
+    PreTrainedTokenizerFast,
+)
 from transformers.testing_utils import require_torch, require_vision
 from transformers.utils import is_torch_available, is_vision_available

@@ -30,14 +37,6 @@
 if is_vision_available():
     from PIL import Image

-    from transformers import (
-        AutoProcessor,
-        IdeficsImageProcessor,
-        IdeficsProcessor,
-        LlamaTokenizerFast,
-        PreTrainedTokenizerFast,
-    )
-

 @require_torch
 @require_vision
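
Note: below is a minimal usage sketch of the two call paths distinguished by the first hunk. The checkpoint name and the dummy images are illustrative assumptions, not taken from this diff; only the images/text keyword handling shown in __call__ above is relied upon.

# Hedged sketch of the updated IdeficsProcessor.__call__ pairing rules.
# Assumptions (not from this diff): the "HuggingFaceM4/idefics-9b"
# checkpoint name and the dummy 30x30 images below.
import numpy as np
from PIL import Image

from transformers import IdeficsProcessor

processor = IdeficsProcessor.from_pretrained("HuggingFaceM4/idefics-9b")

img1 = Image.fromarray(np.zeros((30, 30, 3), dtype=np.uint8))
img2 = Image.fromarray(np.full((30, 30, 3), 255, dtype=np.uint8))

# Flat case: one prompt per image. A bare str is now wrapped as [text]
# instead of being repeated len(images) times, so len(text) must equal
# len(images) or the new ValueError above is raised.
inputs = processor(images=[img1, img2], text=["Describe:", "Describe:"])

# Nested case: several images per prompt. images[0] is a list, so the
# processor nests the text to match (text = [[i] for i in text]).
inputs = processor(images=[[img1, img2]], text=["Compare the two images:"])

The relocation of the "padding_side" workaround also changes where the key is dropped: instead of mutating self.tokenizer.init_kwargs before merging, the init kwargs are passed to _merge_kwargs intact and the unsupported key is popped from the merged text_kwargs.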