diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py index cdc7e9ba4e77b0..dca3b177cd3fc1 100644 --- a/src/transformers/models/idefics2/modeling_idefics2.py +++ b/src/transformers/models/idefics2/modeling_idefics2.py @@ -1558,7 +1558,7 @@ def forward( ... "In which city is that bridge located?", ... ] >>> images = [[image1, image2], [image3]] - >>> inputs = processor(text=prompts, images=images, padding=True, return_tensors="pt").to("cuda") + >>> inputs = processor(images=images, text=prompts, padding=True, return_tensors="pt").to("cuda") >>> # Generate >>> generated_ids = model.generate(**inputs, bad_words_ids=BAD_WORDS_IDS, max_new_tokens=20) diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py index b42a121eda26dc..96b7c7601cac12 100644 --- a/src/transformers/models/idefics2/processing_idefics2.py +++ b/src/transformers/models/idefics2/processing_idefics2.py @@ -17,6 +17,7 @@ """ import sys +import warnings from typing import TYPE_CHECKING, List, Optional, Union from ...feature_extraction_utils import BatchFeature @@ -149,7 +150,7 @@ def __call__( ... "In this image, we see", ... "bla bla bla", ... ] - >>> outputs = processor(text=text, images=images, return_tensors="pt", padding=True) + >>> outputs = processor(images=images, text=text, return_tensors="pt", padding=True) >>> input_ids = outputs.input_ids >>> input_tokens = processor.tokenizer.batch_decode(input_ids) >>> print(input_tokens) @@ -169,6 +170,24 @@ def __call__( `` + `` * `image_seq_len` * `. """ + if text is None and images is None: + raise ValueError("You must provide either `text` or `images`.") + # check if images and text inputs are reversed for BC + if ( + text is not None + and not isinstance(text[0], str) + or images is not None + and not ( + is_image_or_image_url(images) + or is_image_or_image_url(images[0]) + or (isinstance(images[0], list) and is_image_or_image_url(images[0][0])) + ) + ): + warnings.warn( + "It looks like you are passing the inputs in the wrong order. You should pass the images input first and the text input second." + "Images and text inputs will be swapped." + ) + images, text = text, images output_kwargs = self._merge_kwargs( Idefics2ProcessorKwargs,