Skip to content

Commit

Permalink
Add support for multiple images per prompt in image-text-to-text mode idefics
Browse files Browse the repository at this point in the history
  • Loading branch information
yonigozlan committed Aug 14, 2024
1 parent 1a231cb commit 8b171a7
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 15 deletions.
16 changes: 9 additions & 7 deletions src/transformers/models/idefics/processing_idefics.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,26 +349,28 @@ def __call__(
if not isinstance(images, (list, tuple)):
images = [images]
if isinstance(text, str):
# one prompt for all images instead of one prompt per image
text = [text] * len(images)
# Check if batched text is provided
text = [text]
# Check if batched images and text are in the correct format
if isinstance(text, (list, tuple)) and len(text) != len(images):
raise ValueError(
"When using the image-text-to-text behavior, the number of prompts should be the same as the number of images."
"When providing both images and text arguments, the number of text prompts should be the same as the number of images."
"If you want to have several images per prompt, images should be nested as such: images=[[img1, img2], [img3, img4], ...] for text=[prompt1, prompt2, ...]."
)
# Check that only text is present in the prompts
if not all(isinstance(i, str) for i in text):
raise ValueError("When using the image-text-to-text behavior, the prompts should only contain text.")
if isinstance(images[0], (list, tuple)):
# if nested images, nest text as well
text = [[i] for i in text]
prompts = list(zip(images, text))

# Temporary fix for "paddding_side" in init_kwargs
_ = self.tokenizer.init_kwargs.pop("padding_side", None)

output_kwargs = self._merge_kwargs(
IdeficsProcessorKwargs,
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
**kwargs,
)
# Temporary fix for "paddding_side" in init_kwargs
_ = output_kwargs["text_kwargs"].pop("padding_side", None)

add_eos_token = output_kwargs["text_kwargs"].pop("add_eos_token", False)
add_end_of_utterance_token = output_kwargs["text_kwargs"].pop("add_end_of_utterance_token", None)
Expand Down
1 change: 1 addition & 0 deletions tests/models/idefics/test_image_processing_idefics.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ def __init__(
image_mean=[0.48145466, 0.4578275, 0.40821073],
image_std=[0.26862954, 0.26130258, 0.27577711],
):
super().__init__()
size = size if size is not None else {"shortest_edge": 30}
self.parent = parent
self.batch_size = batch_size
Expand Down
15 changes: 7 additions & 8 deletions tests/models/idefics/test_processor_idefics.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,13 @@

import numpy as np

from transformers import (
AutoProcessor,
IdeficsImageProcessor,
IdeficsProcessor,
LlamaTokenizerFast,
PreTrainedTokenizerFast,
)
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_vision_available

Expand All @@ -30,14 +37,6 @@
if is_vision_available():
from PIL import Image

from transformers import (
AutoProcessor,
IdeficsImageProcessor,
IdeficsProcessor,
LlamaTokenizerFast,
PreTrainedTokenizerFast,
)


@require_torch
@require_vision
Expand Down

0 comments on commit 8b171a7

Please sign in to comment.