From a94cf092fc16f15013d4d09f1293f4102b1a47c6 Mon Sep 17 00:00:00 2001
From: yonigozlan
Date: Fri, 9 Aug 2024 00:23:48 +0000
Subject: [PATCH 1/6] Uniformize idefics processor kwargs and add tests

---
 .../models/idefics/processing_idefics.py      | 157 +++++++++------
 .../models/idefics2/processing_idefics2.py    |  20 +-
 .../models/idefics/test_processor_idefics.py  | 190 +++++++++++++++++-
 3 files changed, 297 insertions(+), 70 deletions(-)

diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py
index 8e9e196764f923..49d322350d7b03 100644
--- a/src/transformers/models/idefics/processing_idefics.py
+++ b/src/transformers/models/idefics/processing_idefics.py
@@ -16,15 +16,23 @@
 Processor class for IDEFICS.
 """

-from typing import Callable, List, Optional, Union
+import sys
+import warnings
+from typing import List, Union
 from urllib.parse import urlparse

 from ...feature_extraction_utils import BatchFeature
-from ...processing_utils import ProcessorMixin
-from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, TextInput, TruncationStrategy
+from ...processing_utils import ProcessingKwargs, ProcessorMixin
+from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
 from ...utils import is_tf_available, is_torch_available
+from ...utils.deprecation import deprecate_kwarg

+if sys.version_info >= (3, 11):
+    from typing import Unpack
+else:
+    from typing_extensions import Unpack
+
 if is_torch_available():
     import torch

@@ -34,6 +42,16 @@
 IMAGE_TOKEN = "<image>"


+class IdeficsProcessorKwargs(ProcessingKwargs, total=False):
+    _defaults = {
+        "text_kwargs": {
+            "add_special_tokens": False,
+            "padding": "longest",
+        },
+        "images_kwargs": {},
+    }
+
+
 # copied from m4.training.packing
 def incremental_to_binary_attention_mask(incremental_mask, return_tensors, num_classes=-1):
     # Set elements >= num_classes to -1
@@ -199,55 +217,35 @@ def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_u
             else False
         )

+    @deprecate_kwarg(
+        old_name="transform", version="5.0.0", additional_message="Add kwargs to the image processor instead."
+    )
+    @deprecate_kwarg(old_name="prompts", version="5.0.0", new_name="text", raise_if_both_names=True)
     def __call__(
         self,
-        prompts: Union[List[TextInput], List[List[TextInput]]],
-        padding: Union[bool, str, PaddingStrategy] = "longest",
-        truncation: Union[bool, str, TruncationStrategy] = None,
-        max_length: Optional[int] = None,
-        transform: Callable = None,
-        add_eos_token=False,
-        add_end_of_utterance_token=None,
-        debug=False,
-        return_tensors="pt",
+        images=None,
+        text: Union[
+            TextInput,
+            PreTokenizedInput,
+            List[TextInput],
+            List[PreTokenizedInput],
+            List[List[TextInput]],
+            List[List[PreTokenizedInput]],
+        ] = None,
+        audio=None,
+        videos=None,
+        **kwargs: Unpack[IdeficsProcessorKwargs],
     ) -> BatchEncoding:
         """This method takes batched or non-batched prompts made of text and images and converts them into prompts that
         the model was trained on and prepares the image pixel values for the model to process.

         Args:
-            prompts (`Union[List[TextInput], [List[List[TextInput]]]]`):
+            text (`Union[List[TextInput], [List[List[TextInput]]]]`):
                 either a single prompt or a batched list of prompts - see the detailed description immediately after
                 the end of the arguments doc section.
-            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `"longest"`):
-                Select a strategy to pad the returned sequences (according to the model's padding side and padding
-                index) among:
-                - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
-                  sequence if provided).
-                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
-                  acceptable input length for the model if that argument is not provided.
-                - `False` or `'do_not_pad'`: No padding. This will raise an error if the input sequences are of different
-                  lengths.
-                Note: Unlike most processors, which set padding=`False` by default, `IdeficsProcessor` sets `padding="longest"`
-                by default. See https://github.com/huggingface/transformers/pull/29449#pullrequestreview-1925576061 for why.
-            max_length (`int`, *optional*):
-                Maximum length of the returned list and optionally padding length (see above).
-            truncation (`bool`, *optional*):
-                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
-            transform (`Callable`, *optional*):
-                A custom transform function that accepts a single image can be passed for training. For example,
-                `torchvision.Compose` can be used to compose multiple functions. If `None` a preset inference-specific
-                set of transforms will be applied to the images
-            add_eos_token (`bool`, *optional*, defaults to `False`):
-                Adds `eos_token` at the end of the final prompt if True`
-            add_end_of_utterance_token (`bool`, *optional*)
-                Whether to automatically add `<end_of_utterance>` after each prompt's text input (unless followed by an
-                image). If `None` the tokenizer will be checked instead and if this token is found in
-                `additional_special_tokens` then the value will be `True`.
-            debug (`bool`, *optional*, defaults to `False`):
-                `True` value will help debug prompt generation by dumping useful information
-            return_tensors (`str` or `TensorType`, *optional*, defaults to `TensorType.PYTORCH`):
-                The type of tensors to return. Can be one of:
-                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+            images (`Union[PIL.Image, str, List[PIL.Image], List[str]]`):
+                either a single image or a batched list of images - can be passed in when text contains only text prompts,
+                in order to use the image-text-to-text behavior.

         Returns:
             a dict with entries: `input_ids`, `attention_mask`, `pixel_values`, `image_attention_mask` which can be
@@ -255,7 +253,7 @@ def __call__(

         Detailed explanation:

-        Each entry in `prompts` is either a text to be passed as is or an image that will be processed.
+        Each entry in `text` is either a text to be passed as is or an image that will be processed.

         An image can be either an image object (`PIL.Image`) or a url from which the image can be retrieved.

@@ -317,12 +315,60 @@ def __call__(

         In order to help debug prompt generation enable `debug=True` which will show you what's happening.

         """
+        if images is None and text is None:
+            raise ValueError("You need to specify either `text` or both `images` and `text`.")
+        # for BC
+        if text is None:
+            # if the user didn't specify text=text in the call, we assume they want to use the old behavior
+            # with text (previously prompts) as a first argument
+            warnings.warn(
+                "The use of `text` as the first argument will be deprecated in the future. `images` is now the first argument. "
+                "The first given argument will be considered as `prompts` in the old behavior.",
+            )
+            text = images
+            images = None
+        if images is None:
+            # assuming the user wants to use the old behavior with prompts as the only argument
+            prompts = text
+        elif text is not None:
+            # Assuming image-text-to-text behavior:
+            # Check if batched images are provided
+            if not isinstance(images, (list, tuple)):
+                images = [images]
+            if isinstance(text, str):
+                # one prompt for all images instead of one prompt per image
+                text = [text] * len(images)
+            # Check if batched text is provided
+            if isinstance(text, (list, tuple)) and len(text) != len(images):
+                raise ValueError(
+                    "When using the image-text-to-text behavior, the number of prompts should be the same as the number of images."
+                )
+            # Check that only text is present in the prompts
+            if not all(isinstance(i, str) for i in text):
+                raise ValueError("When using the image-text-to-text behavior, the prompts should only contain text.")
+            prompts = list(zip(images, text))
+
+        # for BC
+        transform = kwargs.pop("transform", None)
+        add_eos_token = kwargs.pop("add_eos_token", False)
+        add_end_of_utterance_token = kwargs.pop("add_end_of_utterance_token", None)
+
+        # Temporary fix for "paddding_side" in init_kwargs
+        _ = self.tokenizer.init_kwargs.pop("padding_side", None)
+
+        output_kwargs = self._merge_kwargs(
+            IdeficsProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        if transform is not None:
+            output_kwargs["images_kwargs"]["transform"] = transform

         # if the value isn't overriden by the user, check if the tokenizer was trained with this token and then use it
         if add_end_of_utterance_token is None:
             add_end_of_utterance_token = self.tokenizer_was_trained_with_end_of_utterance_token

         # turn non-batched prompts into batched
-        if not any(isinstance(i, list) for i in prompts):
+        if not any(isinstance(i, (list, tuple)) for i in prompts):
             prompts = [prompts]

         fake_token = "<fake_token_around_image>"
@@ -371,21 +417,14 @@ def image_tokens(last_was_image):
             if add_eos_token:
                 full_text += self.tokenizer.eos_token

-            if debug is True:
-                print(f"{full_text=}")
-
-            image_objects = self.image_processor(image_objects, transform=transform, return_tensors=return_tensors)
+            image_objects = self.image_processor(image_objects, **output_kwargs["images_kwargs"])

             all_prompts.append(full_text)
             all_images.append(image_objects)

-        text_encoding = self.tokenizer(
-            text=all_prompts,
-            add_special_tokens=False,
-            padding=padding,
-            truncation=truncation,
-            max_length=max_length,
-        )
+        # For BC
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", "pt")
+        text_encoding = self.tokenizer(all_prompts, **output_kwargs["text_kwargs"])
         all_texts = text_encoding["input_ids"]
         all_attention_masks = text_encoding["attention_mask"]

@@ -398,12 +437,12 @@ def image_tokens(last_was_image):
         output_images = []
         output_attention_masks = []

-        for text, attention_mask, images in zip(all_texts, all_attention_masks, all_images):
-            padded_input_ids = text
+        for text_single, attention_mask, extracted_images in zip(all_texts, all_attention_masks, all_images):
+            padded_input_ids = text_single
             image_count = padded_input_ids.count(self.image_token_id)
             local_max_num_images = min(image_count, max_num_images)

-            current_images = images[:local_max_num_images]
+            current_images = extracted_images[:local_max_num_images]

             if len(current_images) > 0:
                 if return_tensors == "pt":
diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py
index
2e14118144baaa..cc61c670cc49fc 100644 --- a/src/transformers/models/idefics2/processing_idefics2.py +++ b/src/transformers/models/idefics2/processing_idefics2.py @@ -20,7 +20,7 @@ from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, is_valid_image, load_image -from ...processing_utils import ProcessorMixin +from ...processing_utils import ProcessingKwargs, ProcessorMixin from ...tokenization_utils_base import AddedToken, BatchEncoding, PaddingStrategy, TextInput, TruncationStrategy from ...utils import TensorType, logging @@ -40,6 +40,24 @@ def is_image_or_image_url(elem): return is_url(elem) or is_valid_image(elem) +class IdeficsProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "add_special_tokens": True, + "padding": False, + "stride": 0, + "return_attention_mask": True, + "return_overflowing_tokens": False, + "return_special_tokens_mask": False, + "return_offsets_mapping": False, + "return_token_type_ids": False, + "return_length": False, + "verbose": True, + }, + "images_kwargs": {}, + } + + class Idefics2Processor(ProcessorMixin): r""" Constructs a IDEFICS2 processor which wraps a LLama tokenizer and IDEFICS2 image processor into a single processor. diff --git a/tests/models/idefics/test_processor_idefics.py b/tests/models/idefics/test_processor_idefics.py index 26dcbb1c0f1566..31c9da7d750aa9 100644 --- a/tests/models/idefics/test_processor_idefics.py +++ b/tests/models/idefics/test_processor_idefics.py @@ -12,11 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. +import shutil +import tempfile +import unittest + import numpy as np -from transformers.testing_utils import TestCasePlus, require_torch, require_vision +from transformers.testing_utils import require_torch, require_vision from transformers.utils import is_torch_available, is_vision_available +from ...test_processing_common import ProcessorTesterMixin + if is_torch_available(): import torch @@ -35,26 +41,29 @@ @require_torch @require_vision -class IdeficsProcessorTest(TestCasePlus): - def setUp(self): - super().setUp() +class IdeficsProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = IdeficsProcessor - self.checkpoint_path = self.get_auto_remove_tmp_dir() + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() image_processor = IdeficsImageProcessor(return_tensors="pt") tokenizer = LlamaTokenizerFast.from_pretrained("HuggingFaceM4/tiny-random-idefics") processor = IdeficsProcessor(image_processor, tokenizer) - processor.save_pretrained(self.checkpoint_path) + processor.save_pretrained(self.tmpdirname) self.input_keys = ["pixel_values", "input_ids", "attention_mask", "image_attention_mask"] def get_tokenizer(self, **kwargs): - return AutoProcessor.from_pretrained(self.checkpoint_path, **kwargs).tokenizer + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer def get_image_processor(self, **kwargs): - return AutoProcessor.from_pretrained(self.checkpoint_path, **kwargs).image_processor + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor + + def tearDown(self): + shutil.rmtree(self.tmpdirname) def prepare_prompts(self): """This function prepares a list of PIL images""" @@ -100,13 +109,13 @@ def prepare_prompts(self): def test_save_load_pretrained_additional_features(self): processor = IdeficsProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) - 
processor.save_pretrained(self.checkpoint_path) + processor.save_pretrained(self.tmpdirname) tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) processor = IdeficsProcessor.from_pretrained( - self.checkpoint_path, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 + self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 ) self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) @@ -208,3 +217,164 @@ def test_model_input_names(self): # For now the processor supports only ['pixel_values', 'input_ids', 'attention_mask'] self.assertSetEqual(set(inputs.keys()), set(self.input_keys)) + + @require_vision + @require_torch + def test_tokenizer_defaults_preserved_by_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input, return_tensors="pt") + self.assertEqual(len(inputs["input_ids"][0]), 117) + + def test_image_processor_defaults_preserved_by_image_kwargs(self): + self.skipTest(reason="IdeficsImageProcessor kwargs are different from usual image processors") + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor", image_size=234) + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + self.assertEqual(len(inputs["pixel_values"][0][0][0]), 234) + + @require_vision + @require_torch + def test_kwargs_overrides_default_tokenizer_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor( + text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length" + ) + self.assertEqual(len(inputs["input_ids"][0]), 112) + + def test_kwargs_overrides_default_image_processor_kwargs(self): + self.skipTest(reason="IdeficsImageProcessor kwargs are different from usual image processors") + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor", image_size=234) + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = 
self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input, image_size=224) + self.assertEqual(len(inputs["pixel_values"][0][0][0]), 224) + + @require_torch + @require_vision + def test_unstructured_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + padding="max_length", + max_length=76, + ) + + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + @require_vision + def test_unstructured_kwargs_batched(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = ["lower newer", "upper older longer string"] + image_input = self.prepare_image_inputs() * 2 + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + padding="longest", + max_length=76, + ) + + self.assertEqual(len(inputs["input_ids"][0]), 8) + + @require_torch + @require_vision + def test_structured_kwargs_nested(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.skip_processor_without_typed_kwargs(processor) + + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + @require_vision + def test_structured_kwargs_nested_from_dict(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + + 
self.assertEqual(len(inputs["input_ids"][0]), 76) From 8af76fad5f61c2a639c9631255982df483f29741 Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Fri, 9 Aug 2024 16:06:38 +0000 Subject: [PATCH 2/6] Uniformize idefics2 processor kwargs --- .../models/idefics/processing_idefics.py | 33 ++- .../models/idefics2/processing_idefics2.py | 84 +++--- .../idefics2/test_processing_idefics2.py | 278 +++++++++++++++--- 3 files changed, 296 insertions(+), 99 deletions(-) diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py index 49d322350d7b03..e644d3356c01ef 100644 --- a/src/transformers/models/idefics/processing_idefics.py +++ b/src/transformers/models/idefics/processing_idefics.py @@ -18,11 +18,11 @@ import sys import warnings -from typing import List, Union +from typing import Callable, List, Optional, Union from urllib.parse import urlparse from ...feature_extraction_utils import BatchFeature -from ...processing_utils import ProcessingKwargs, ProcessorMixin +from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput from ...utils import is_tf_available, is_torch_available from ...utils.deprecation import deprecate_kwarg @@ -42,11 +42,23 @@ IMAGE_TOKEN = "" +class IdeficsImagesKwargs(ImagesKwargs, total=False): + transform: Optional[Callable] + + +class IdeficsTextKwargs(TextKwargs, total=False): + add_eos_token: Optional[bool] + add_end_of_utterance_token: Optional[bool] + + class IdeficsProcessorKwargs(ProcessingKwargs, total=False): + text_kwargs: IdeficsTextKwargs + images_kwargs: IdeficsImagesKwargs _defaults = { "text_kwargs": { "add_special_tokens": False, "padding": "longest", + "add_eos_token": False, }, "images_kwargs": {}, } @@ -217,9 +229,6 @@ def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_u else False ) - @deprecate_kwarg( - old_name="transform", version="5.0.0", additional_message="Add kwargs to the image processor instead." - ) @deprecate_kwarg(old_name="prompts", version="5.0.0", new_name="text", raise_if_both_names=True) def __call__( self, @@ -277,7 +286,7 @@ def __call__( "Describe this image.\nAssistant:", ] - inputs = processor(prompts, return_tensors="pt") + inputs = processor(text=prompts, return_tensors="pt") generated_ids = model.generate(**inputs, max_length=100) generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] ``` @@ -309,7 +318,7 @@ def __call__( transforms.Normalize(mean=self.image_mean, std=self.image_std), ] ) - inputs = processor(prompts, transform=image_transform, return_tensors="pt") + inputs = processor(text=prompts, transform=image_transform, return_tensors="pt") ``` In order to help debug prompt generation enable `debug=True` which will show you what's happening. 
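For readers following the hunks above: after this patch, the backward-compatibility kwargs (`transform`, `add_eos_token`, `add_end_of_utterance_token`) travel through the typed `IdeficsImagesKwargs`/`IdeficsTextKwargs` dicts instead of loose `**kwargs`. A minimal sketch of a call site once the patch is applied — the checkpoint name, prompt contents, and image URL are illustrative assumptions, not values from the diff; only the keyword names come from the kwargs classes defined above:

```python
# Sketch of the post-patch call surface (illustrative inputs, not from the diff).
from transformers import IdeficsProcessor

processor = IdeficsProcessor.from_pretrained("HuggingFaceM4/idefics-9b")

prompts = [
    ["User:", "https://example.com/some-image.jpg", "Describe this image.\nAssistant:"],
]

# Flat kwargs are validated against IdeficsProcessorKwargs by _merge_kwargs and
# layered over the class-level _defaults (add_special_tokens=False,
# padding="longest", add_eos_token=False).
inputs = processor(
    text=prompts,
    add_end_of_utterance_token=False,  # routed into output_kwargs["text_kwargs"]
    return_tensors="pt",               # popped back out for tensor conversion
)
```
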
@@ -348,11 +357,6 @@ def __call__( raise ValueError("When using the image-text-to-text behavior, the prompts should only contain text.") prompts = list(zip(images, text)) - # for BC - transform = kwargs.pop("transform", None) - add_eos_token = kwargs.pop("add_eos_token", False) - add_end_of_utterance_token = kwargs.pop("add_end_of_utterance_token", None) - # Temporary fix for "paddding_side" in init_kwargs _ = self.tokenizer.init_kwargs.pop("padding_side", None) @@ -361,8 +365,9 @@ def __call__( tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, ) - if transform is not None: - output_kwargs["images_kwargs"]["transform"] = transform + + add_eos_token = output_kwargs["text_kwargs"].pop("add_eos_token", False) + add_end_of_utterance_token = output_kwargs["text_kwargs"].pop("add_end_of_utterance_token", None) # if the value isn't overriden by the user, check if the tokenizer was trained with this token and then use it if add_end_of_utterance_token is None: diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py index cc61c670cc49fc..b42a121eda26dc 100644 --- a/src/transformers/models/idefics2/processing_idefics2.py +++ b/src/transformers/models/idefics2/processing_idefics2.py @@ -16,18 +16,24 @@ Processor class for IDEFICS2. """ +import sys from typing import TYPE_CHECKING, List, Optional, Union from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, is_valid_image, load_image -from ...processing_utils import ProcessingKwargs, ProcessorMixin -from ...tokenization_utils_base import AddedToken, BatchEncoding, PaddingStrategy, TextInput, TruncationStrategy -from ...utils import TensorType, logging +from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin +from ...tokenization_utils_base import AddedToken, BatchEncoding, TextInput +from ...utils import logging if TYPE_CHECKING: from ...tokenization_utils_base import PreTokenizedInput +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + logger = logging.get_logger(__name__) @@ -40,19 +46,18 @@ def is_image_or_image_url(elem): return is_url(elem) or is_valid_image(elem) -class IdeficsProcessorKwargs(ProcessingKwargs, total=False): +class Idefics2ImagesKwargs(ImagesKwargs, total=False): + image_seq_len: Optional[int] + + +class Idefics2ProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: Idefics2ImagesKwargs + _defaults = { "text_kwargs": { "add_special_tokens": True, "padding": False, - "stride": 0, - "return_attention_mask": True, - "return_overflowing_tokens": False, - "return_special_tokens_mask": False, - "return_offsets_mapping": False, - "return_token_type_ids": False, - "return_length": False, - "verbose": True, + "is_split_into_words": False, }, "images_kwargs": {}, } @@ -115,15 +120,11 @@ def _extract_images_from_prompts(self, prompts): def __call__( self, - text: Union[TextInput, "PreTokenizedInput", List[TextInput], List["PreTokenizedInput"]] = None, images: Union[ImageInput, List[ImageInput], List[List[ImageInput]]] = None, - image_seq_len: Optional[int] = None, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = None, - max_length: Optional[int] = None, - is_split_into_words: bool = False, - add_special_tokens: bool = True, - return_tensors: Optional[Union[str, TensorType]] = None, + text: Union[TextInput, "PreTokenizedInput", List[TextInput], List["PreTokenizedInput"]] = None, + 
audio=None,
+        videos=None,
+        **kwargs: Unpack[Idefics2ProcessorKwargs],
     ) -> BatchEncoding:
         """
         Processes the input prompts and returns a BatchEncoding.
@@ -156,6 +157,9 @@ def __call__(
         ```

         Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. If is of type `List[ImageInput]`, it's assumed that this is for a single prompt i.e. of batch size 1.
             text (`Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]`, *optional*):
                 The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                 (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                 `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).

                 Wherever an image token, `<image>` is encountered it is expanded to
                 `<fake_token_around_image>` + `<image>` * `image_seq_len` * `<fake_token_around_image>`.
-            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*):
-                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
-                tensor. If is of type `List[ImageInput]`, it's assumed that this is for a single prompt i.e. of batch size 1.
-            image_seq_len (`int`, *optional*):
-                The length of the image sequence. If not provided, the default value is used.
-            padding (`Union[bool, str, PaddingStrategy]`, *optional*, defaults to `False`):
-                Padding strategy applied to the input ids. See [`PreTrainedTokenizerFast.pad`] for more information.
-            truncation (`Union[bool, str, TruncationStrategy]`, *optional*):
-                Truncation strategy applied to the input ids. See [`PreTrainedTokenizerFast.truncate`] for more information.
-            max_length (`int`, *optional*):
-                Maximum length of the returned list and optionally padding/truncation length. See
-                [`PreTrainedTokenizerFast.__call__`] for more information.
-            is_split_into_words (`bool`, *optional*, defaults to `False`):
-                Whether the input text is split into words or not. If set to `True`, the tokenizer will skip the
-                tokenization process and assume the input is already tokenized.
-            add_special_tokens (`bool`, *optional*, defaults to `True`):
-                Whether to add special tokens or not. See [`PreTrainedTokenizerFast.__call__`] for more information.
-            return_tensors (`Union[str, TensorType]`, *optional*):
-                If set, will return tensors of a particular framework. See [`PreTrainedTokenizerFast.__call__`] for more
-                information.
+ """ + + output_kwargs = self._merge_kwargs( + Idefics2ProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + image_seq_len = output_kwargs["images_kwargs"].pop("image_seq_len", None) image_seq_len = image_seq_len if image_seq_len is not None else self.image_seq_len n_images_in_text = [] @@ -212,15 +204,7 @@ def __call__( sample = sample.replace(f"{fake_image_token}{fake_image_token}", f"{fake_image_token}") prompt_strings.append(sample) - text_inputs = self.tokenizer( - text=prompt_strings, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - is_split_into_words=is_split_into_words, - return_tensors=return_tensors, - ) + text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) inputs.update(text_inputs) if images is not None: @@ -245,7 +229,7 @@ def __call__( # Load images if they are URLs images = [[load_image(im) for im in sample] for sample in images] - image_inputs = self.image_processor(images, return_tensors=return_tensors) + image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) inputs.update(image_inputs) return inputs diff --git a/tests/models/idefics2/test_processing_idefics2.py b/tests/models/idefics2/test_processing_idefics2.py index 2fd569f99141af..e8292807626bf2 100644 --- a/tests/models/idefics2/test_processing_idefics2.py +++ b/tests/models/idefics2/test_processing_idefics2.py @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import shutil +import tempfile import unittest from io import BytesIO @@ -22,16 +24,30 @@ from transformers.testing_utils import require_torch, require_vision from transformers.utils import is_vision_available +from ...test_processing_common import ProcessorTesterMixin + if is_vision_available(): from PIL import Image + from transformers import ( + AutoProcessor, + Idefics2Processor, + ) + @require_torch @require_vision -class Idefics2ProcessorTest(unittest.TestCase): +class Idefics2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = Idefics2Processor + def setUp(self): - self.processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics2-8b", image_seq_len=2) + self.tmpdirname = tempfile.mkdtemp() + + processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics2-8b", image_seq_len=2) + + processor.save_pretrained(self.tmpdirname) + self.image1 = Image.open( BytesIO( requests.get( @@ -49,22 +65,35 @@ def setUp(self): ).content ) ) - self.bos_token = self.processor.tokenizer.bos_token - self.image_token = self.processor.image_token.content - self.fake_image_token = self.processor.fake_image_token.content + self.bos_token = processor.tokenizer.bos_token + self.image_token = processor.image_token.content + self.fake_image_token = processor.fake_image_token.content + + self.bos_token_id = processor.tokenizer.convert_tokens_to_ids(self.bos_token) + self.image_token_id = processor.tokenizer.convert_tokens_to_ids(self.image_token) + self.fake_image_token_id = processor.tokenizer.convert_tokens_to_ids(self.fake_image_token) + self.image_seq_len = processor.image_seq_len + + def get_tokenizer(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer - self.bos_token_id = self.processor.tokenizer.convert_tokens_to_ids(self.bos_token) - self.image_token_id = self.processor.tokenizer.convert_tokens_to_ids(self.image_token) - self.fake_image_token_id = 
self.processor.tokenizer.convert_tokens_to_ids(self.fake_image_token)
-        self.image_seq_len = self.processor.image_seq_len
+    def get_image_processor(self, **kwargs):
+        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
+
+    def get_processor(self, **kwargs):
+        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs)
+
+    def tearDown(self):
+        shutil.rmtree(self.tmpdirname)

     def test_process_interleaved_images_prompts_no_image_splitting(self):
-        old_image_splitting = self.processor.image_processor.do_image_splitting
+        tokenizer = self.get_tokenizer()
+        processor = self.get_processor()

-        self.processor.image_processor.do_image_splitting = False
+        processor.image_processor.do_image_splitting = False

         # Test that a single image is processed correctly
-        inputs = self.processor(images=self.image1)
+        inputs = processor(images=self.image1)
         self.assertEqual(inputs["pixel_values"].shape, (1, 1, 3, 653, 980))
         self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 1, 653, 980))
         # fmt: on
@@ -73,10 +102,10 @@ def test_process_interleaved_images_prompts_no_image_splitting(self):
         image_str = "<image>"
         text_str = "In this image, we see"
         text = image_str + text_str
-        inputs = self.processor(text=text, images=self.image1)
+        inputs = processor(text=text, images=self.image1)

         # fmt: off
-        tokenized_sentence = self.processor.tokenizer(text_str, add_special_tokens=False)
+        tokenized_sentence = tokenizer(text_str, add_special_tokens=False)
         expected_input_ids = [[self.bos_token_id] + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + tokenized_sentence["input_ids"]]
         self.assertEqual(inputs["input_ids"], expected_input_ids)
         self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])])
@@ -95,11 +124,11 @@ def test_process_interleaved_images_prompts_no_image_splitting(self):
         ]
         images = [[self.image1], [self.image2, self.image3]]

-        inputs = self.processor(text=text, images=images, padding=True)
+        inputs = processor(text=text, images=images, padding=True)

         # fmt: off
-        tokenized_sentence_1 = self.processor.tokenizer(text_str_1, add_special_tokens=False)
-        tokenized_sentence_2 = self.processor.tokenizer(text_str_2, add_special_tokens=False)
+        tokenized_sentence_1 = tokenizer(text_str_1, add_special_tokens=False)
+        tokenized_sentence_2 = tokenizer(text_str_2, add_special_tokens=False)
         expected_input_ids_1 = [self.bos_token_id] + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + tokenized_sentence_1["input_ids"]
         expected_input_ids_2 = [self.bos_token_id] + tokenized_sentence_2["input_ids"] + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id]
         # Pad the first input to match the second input
@@ -117,15 +146,13 @@ def test_process_interleaved_images_prompts_no_image_splitting(self):
         self.assertEqual(inputs['pixel_attention_mask'].shape, (2, 2, 767, 980))
         # fmt: on

-        self.processor.image_processor.do_image_splitting = old_image_splitting
-
     def test_process_interleaved_images_prompts_image_splitting(self):
-        old_image_splitting = self.processor.image_processor.do_image_splitting
-
-        self.processor.image_processor.do_image_splitting = True
+        processor = self.get_processor()
+        tokenizer = self.get_tokenizer()
+        processor.image_processor.do_image_splitting = True

         # Test that a single image is processed correctly
-        inputs = self.processor(images=self.image1)
+        inputs = processor(images=self.image1)
         self.assertEqual(inputs["pixel_values"].shape, (1, 5, 3, 653, 980))
         self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 5, 653, 980))
         # fmt: on
@@ -134,10 +161,10 @@ def test_process_interleaved_images_prompts_image_splitting(self):
         image_str = "<image>"
         text_str = "In this image, we see"
         text = image_str + text_str
-        inputs = self.processor(text=text, images=self.image1)
+        inputs = processor(text=text, images=self.image1)

         # fmt: off
-        tokenized_sentence = self.processor.tokenizer(text_str, add_special_tokens=False)
+        tokenized_sentence = tokenizer(text_str, add_special_tokens=False)
         expected_input_ids = [[self.bos_token_id] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + [self.fake_image_token_id] + tokenized_sentence["input_ids"]]
         self.assertEqual(inputs["input_ids"], expected_input_ids)
         self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])])
@@ -156,11 +183,11 @@ def test_process_interleaved_images_prompts_image_splitting(self):
         ]
         images = [[self.image1], [self.image2, self.image3]]

-        inputs = self.processor(text=text, images=images, padding=True)
+        inputs = processor(text=text, images=images, padding=True)

         # fmt: off
-        tokenized_sentence_1 = self.processor.tokenizer(text_str_1, add_special_tokens=False)
-        tokenized_sentence_2 = self.processor.tokenizer(text_str_2, add_special_tokens=False)
+        tokenized_sentence_1 = tokenizer(text_str_1, add_special_tokens=False)
+        tokenized_sentence_2 = tokenizer(text_str_2, add_special_tokens=False)
         expected_input_ids_1 = [self.bos_token_id] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + [self.fake_image_token_id] + tokenized_sentence_1["input_ids"]
         expected_input_ids_2 = [self.bos_token_id] + tokenized_sentence_2["input_ids"] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + [self.fake_image_token_id]
         # Pad the first input to match the second input
@@ -178,22 +205,22 @@ def test_process_interleaved_images_prompts_image_splitting(self):
         self.assertEqual(inputs['pixel_attention_mask'].shape, (2, 10, 767, 980))
         # fmt: on

-        self.processor.image_processor.do_image_splitting = old_image_splitting
-
     def test_add_special_tokens_processor(self):
+        processor = self.get_processor()
+        tokenizer = self.get_tokenizer()
         image_str = "<image>"
         text_str = "In this image, we see"
         text = text_str + image_str

-        n_image_repeat = 5 if self.processor.image_processor.do_image_splitting else 1
+        n_image_repeat = 5 if processor.image_processor.do_image_splitting else 1

         # fmt: off
-        inputs = self.processor(text=text, images=self.image1, add_special_tokens=False)
-        tokenized_sentence = self.processor.tokenizer(text_str, add_special_tokens=False)
+        inputs = processor(text=text, images=self.image1, add_special_tokens=False)
+        tokenized_sentence = tokenizer(text_str, add_special_tokens=False)
         expected_input_ids = [tokenized_sentence["input_ids"] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * n_image_repeat + [self.fake_image_token_id]]
         self.assertEqual(inputs["input_ids"], expected_input_ids)

-        inputs = self.processor(text=text, images=self.image1)
+        inputs = processor(text=text, images=self.image1)
         expected_input_ids = [[self.bos_token_id] + tokenized_sentence["input_ids"] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * n_image_repeat + [self.fake_image_token_id]]
         self.assertEqual(inputs["input_ids"], expected_input_ids)
         # fmt: on
@@ -222,7 +249,7 @@ def test_apply_chat_template(self):
             {"role": "user", "content": [{"type": "text", "text": "And who is that?"}]},
         ]

-        processor = self.processor
+        processor = self.get_processor()
         # Make short sequence length to test that the fake tokens are added correctly
         rendered = processor.apply_chat_template(messages, add_generation_prompt=True)

@@ -233,3 +260,184 @@ def test_apply_chat_template(self):
             "Assistant:"
         )
         self.assertEqual(rendered, expected_rendered)
+
+    @require_vision
+    @require_torch
+    def test_tokenizer_defaults_preserved_by_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+        image_str = "<image>"
+        input_str = image_str + "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        inputs = processor(text=input_str, images=image_input, return_tensors="pt")
+        self.assertEqual(len(inputs["input_ids"][0]), 117)
+
+    @require_torch
+    @require_vision
+    def test_image_processor_defaults_preserved_by_image_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor", size={"height": 234, "width": 234})
+        tokenizer = self.get_component("tokenizer", max_length=117)
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        image_str = "<image>"
+        input_str = image_str + "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        inputs = processor(text=input_str, images=image_input)
+        self.assertEqual(len(inputs["pixel_values"][0][0][0]), 234)
+
+    @require_vision
+    @require_torch
+    def test_kwargs_overrides_default_tokenizer_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer", max_length=117)
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+        image_str = "<image>"
+        input_str = image_str + "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        inputs = processor(
+            text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length"
+        )
+        self.assertEqual(len(inputs["input_ids"][0]), 112)
+
+    @require_torch
+    @require_vision
+    def test_kwargs_overrides_default_image_processor_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor", size={"height": 234, "width": 234})
+        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        image_str = "<image>"
+        input_str = image_str + "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        inputs = processor(text=input_str, images=image_input, size={"height": 224, "width": 224})
+        self.assertEqual(len(inputs["pixel_values"][0][0][0]), 224)
+
+    @require_torch
+    @require_vision
+    def test_unstructured_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        image_str = "<image>"
+        input_str = image_str + "lower newer"
+        image_input = self.prepare_image_inputs()
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            size={"height": 214, "width": 214},
+            padding="max_length",
+            max_length=76,
+        )
+
+        self.assertEqual(inputs["pixel_values"].shape[3], 214)
+        self.assertEqual(len(inputs["input_ids"][0]), 76)
+
+    @require_torch
+    @require_vision
+    def test_unstructured_kwargs_batched(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        image_str = "<image>"
+        input_str = [image_str + "lower newer", image_str + "upper older longer string"]
+        image_input = [self.prepare_image_inputs()] * 2
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            size={"height": 214, "width": 214},
+            padding="longest",
+            max_length=76,
+        )
+
+        self.assertEqual(inputs["pixel_values"].shape[3], 214)
+
+        self.assertEqual(len(inputs["input_ids"][0]), 21)
+
+    @require_torch
+    @require_vision
+    def test_structured_kwargs_nested(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        image_str = "<image>"
+        input_str = image_str + "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        # Define the kwargs for each modality
+        all_kwargs = {
+            "common_kwargs": {"return_tensors": "pt"},
+            "images_kwargs": {"size": {"height": 214, "width": 214}},
+            "text_kwargs": {"padding": "max_length", "max_length": 76},
+        }
+
+        inputs = processor(text=input_str, images=image_input, **all_kwargs)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        self.assertEqual(inputs["pixel_values"].shape[3], 214)
+
+        self.assertEqual(len(inputs["input_ids"][0]), 76)
+
+    @require_torch
+    @require_vision
+    def test_structured_kwargs_nested_from_dict(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+        image_str = "<image>"
+        input_str = image_str + "lower newer"
+        image_input = self.prepare_image_inputs()
+ # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": {"size": {"height": 214, "width": 214}}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.assertEqual(inputs["pixel_values"].shape[3], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 76) From e6747ff8b403d143b0d56892368fdb426a550dbb Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Mon, 12 Aug 2024 22:21:54 +0000 Subject: [PATCH 3/6] add image_processor tests idefics --- .../models/idefics/processing_idefics.py | 6 +++++- tests/models/idefics/test_processor_idefics.py | 12 ++++++++---- tests/models/idefics2/test_processing_idefics2.py | 4 ---- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py index e644d3356c01ef..3a7fcc80ffb225 100644 --- a/src/transformers/models/idefics/processing_idefics.py +++ b/src/transformers/models/idefics/processing_idefics.py @@ -18,7 +18,7 @@ import sys import warnings -from typing import Callable, List, Optional, Union +from typing import Callable, Dict, List, Optional, Union from urllib.parse import urlparse from ...feature_extraction_utils import BatchFeature @@ -44,6 +44,9 @@ class IdeficsImagesKwargs(ImagesKwargs, total=False): transform: Optional[Callable] + image_size: Optional[Dict[str, int]] + image_mean: Optional[Union[float, List[float]]] + image_std: Optional[Union[float, List[float]]] class IdeficsTextKwargs(TextKwargs, total=False): @@ -61,6 +64,7 @@ class IdeficsProcessorKwargs(ProcessingKwargs, total=False): "add_eos_token": False, }, "images_kwargs": {}, + "common_kwargs": {"return_tensors": "pt"}, } diff --git a/tests/models/idefics/test_processor_idefics.py b/tests/models/idefics/test_processor_idefics.py index 31c9da7d750aa9..bdb5554b9402d7 100644 --- a/tests/models/idefics/test_processor_idefics.py +++ b/tests/models/idefics/test_processor_idefics.py @@ -235,7 +235,6 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): self.assertEqual(len(inputs["input_ids"][0]), 117) def test_image_processor_defaults_preserved_by_image_kwargs(self): - self.skipTest(reason="IdeficsImageProcessor kwargs are different from usual image processors") if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor", image_size=234) @@ -269,7 +268,6 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): self.assertEqual(len(inputs["input_ids"][0]), 112) def test_kwargs_overrides_default_image_processor_kwargs(self): - self.skipTest(reason="IdeficsImageProcessor kwargs are different from usual image processors") if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor", image_size=234) @@ -301,10 +299,12 @@ def test_unstructured_kwargs(self): text=input_str, images=image_input, return_tensors="pt", + image_size=214, padding="max_length", max_length=76, ) + self.assertEqual(inputs["pixel_values"].shape[3], 214) self.assertEqual(len(inputs["input_ids"][0]), 76) @require_torch @@ -324,10 +324,12 @@ def test_unstructured_kwargs_batched(self): text=input_str, images=image_input, return_tensors="pt", + image_size=214, padding="longest", max_length=76, 
) + self.assertEqual(inputs["pixel_values"].shape[3], 214) self.assertEqual(len(inputs["input_ids"][0]), 8) @require_torch @@ -347,12 +349,13 @@ def test_structured_kwargs_nested(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": {"image_size": 214}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } inputs = processor(text=input_str, images=image_input, **all_kwargs) self.skip_processor_without_typed_kwargs(processor) - + self.assertEqual(inputs["pixel_values"].shape[3], 214) self.assertEqual(len(inputs["input_ids"][0]), 76) @require_torch @@ -372,9 +375,10 @@ def test_structured_kwargs_nested_from_dict(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": {"image_size": 214}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } inputs = processor(text=input_str, images=image_input, **all_kwargs) - + self.assertEqual(inputs["pixel_values"].shape[3], 214) self.assertEqual(len(inputs["input_ids"][0]), 76) diff --git a/tests/models/idefics2/test_processing_idefics2.py b/tests/models/idefics2/test_processing_idefics2.py index e8292807626bf2..25ab3bd67c5f4e 100644 --- a/tests/models/idefics2/test_processing_idefics2.py +++ b/tests/models/idefics2/test_processing_idefics2.py @@ -383,7 +383,6 @@ def test_unstructured_kwargs_batched(self): ) self.assertEqual(inputs["pixel_values"].shape[3], 214) - self.assertEqual(len(inputs["input_ids"][0]), 21) @require_torch @@ -410,9 +409,7 @@ def test_structured_kwargs_nested(self): inputs = processor(text=input_str, images=image_input, **all_kwargs) self.skip_processor_without_typed_kwargs(processor) - self.assertEqual(inputs["pixel_values"].shape[3], 214) - self.assertEqual(len(inputs["input_ids"][0]), 76) @require_torch @@ -439,5 +436,4 @@ def test_structured_kwargs_nested_from_dict(self): inputs = processor(text=input_str, images=image_input, **all_kwargs) self.assertEqual(inputs["pixel_values"].shape[3], 214) - self.assertEqual(len(inputs["input_ids"][0]), 76) From 1a231cbfabede4c2390e54a2f6bae3958c2b4190 Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Mon, 12 Aug 2024 22:34:32 +0000 Subject: [PATCH 4/6] add BC args order change idefics2 processor and update doc --- .../models/idefics2/modeling_idefics2.py | 2 +- .../models/idefics2/processing_idefics2.py | 21 ++++++++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py index cdc7e9ba4e77b0..dca3b177cd3fc1 100644 --- a/src/transformers/models/idefics2/modeling_idefics2.py +++ b/src/transformers/models/idefics2/modeling_idefics2.py @@ -1558,7 +1558,7 @@ def forward( ... "In which city is that bridge located?", ... 
]
     >>> images = [[image1, image2], [image3]]

-    >>> inputs = processor(text=prompts, images=images, padding=True, return_tensors="pt").to("cuda")
+    >>> inputs = processor(images=images, text=prompts, padding=True, return_tensors="pt").to("cuda")

     >>> # Generate
     >>> generated_ids = model.generate(**inputs, bad_words_ids=BAD_WORDS_IDS, max_new_tokens=20)
diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py
index b42a121eda26dc..96b7c7601cac12 100644
--- a/src/transformers/models/idefics2/processing_idefics2.py
+++ b/src/transformers/models/idefics2/processing_idefics2.py
@@ -17,6 +17,7 @@
 """

 import sys
+import warnings
 from typing import TYPE_CHECKING, List, Optional, Union

 from ...feature_extraction_utils import BatchFeature
@@ -149,7 +150,7 @@ def __call__(
         ...     "In this image, we see",
         ...     "bla bla bla",
         ... ]
-        >>> outputs = processor(text=text, images=images, return_tensors="pt", padding=True)
+        >>> outputs = processor(images=images, text=text, return_tensors="pt", padding=True)
         >>> input_ids = outputs.input_ids
         >>> input_tokens = processor.tokenizer.batch_decode(input_ids)
         >>> print(input_tokens)
@@ -169,6 +170,24 @@ def __call__(
             `<fake_token_around_image>` + `<image>` * `image_seq_len` * `<fake_token_around_image>`.

         """
+        if text is None and images is None:
+            raise ValueError("You must provide either `text` or `images`.")
+        # check if images and text inputs are reversed for BC
+        if (
+            text is not None
+            and not isinstance(text[0], str)
+            or images is not None
+            and not (
+                is_image_or_image_url(images)
+                or is_image_or_image_url(images[0])
+                or (isinstance(images[0], list) and is_image_or_image_url(images[0][0]))
+            )
+        ):
+            warnings.warn(
+                "It looks like you are passing the inputs in the wrong order. You should pass the images input first and the text input second. "
+                "Images and text inputs will be swapped."
+            )
+            images, text = text, images

         output_kwargs = self._merge_kwargs(
From 8b171a777bac10bbb9c9a13bd36d6ffd10be9b9d Mon Sep 17 00:00:00 2001
From: yonigozlan
Date: Wed, 14 Aug 2024 14:13:17 +0000
Subject: [PATCH 5/6] Add support for multiple images per prompt in
 image-text-to-text mode idefics

---
 .../models/idefics/processing_idefics.py          | 16 +++++++++-------
 .../idefics/test_image_processing_idefics.py      |  1 +
 tests/models/idefics/test_processor_idefics.py    | 15 +++++++--------
 3 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py
index 3a7fcc80ffb225..7f6ff773d8ee5b 100644
--- a/src/transformers/models/idefics/processing_idefics.py
+++ b/src/transformers/models/idefics/processing_idefics.py
@@ -349,26 +349,28 @@ def __call__(
             if not isinstance(images, (list, tuple)):
                 images = [images]
             if isinstance(text, str):
-                # one prompt for all images instead of one prompt per image
-                text = [text] * len(images)
-            # Check if batched text is provided
+                text = [text]
+            # Check if batched images and text are in the correct format
             if isinstance(text, (list, tuple)) and len(text) != len(images):
                 raise ValueError(
-                    "When using the image-text-to-text behavior, the number of prompts should be the same as the number of images."
+                    "When providing both images and text arguments, the number of text prompts should be the same as the number of images. "
+                    "If you want to have several images per prompt, images should be nested as such: images=[[img1, img2], [img3, img4], ...] for text=[prompt1, prompt2, ...]."
) # Check that only text is present in the prompts if not all(isinstance(i, str) for i in text): raise ValueError("When using the image-text-to-text behavior, the prompts should only contain text.") + if isinstance(images[0], (list, tuple)): + # if nested images, nest text as well + text = [[i] for i in text] prompts = list(zip(images, text)) - # Temporary fix for "paddding_side" in init_kwargs - _ = self.tokenizer.init_kwargs.pop("padding_side", None) - output_kwargs = self._merge_kwargs( IdeficsProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, ) + # Temporary fix for "paddding_side" in init_kwargs + _ = output_kwargs["text_kwargs"].pop("padding_side", None) add_eos_token = output_kwargs["text_kwargs"].pop("add_eos_token", False) add_end_of_utterance_token = output_kwargs["text_kwargs"].pop("add_end_of_utterance_token", None) diff --git a/tests/models/idefics/test_image_processing_idefics.py b/tests/models/idefics/test_image_processing_idefics.py index 2f7a8993df5348..b567d97a13be67 100644 --- a/tests/models/idefics/test_image_processing_idefics.py +++ b/tests/models/idefics/test_image_processing_idefics.py @@ -49,6 +49,7 @@ def __init__( image_mean=[0.48145466, 0.4578275, 0.40821073], image_std=[0.26862954, 0.26130258, 0.27577711], ): + super().__init__() size = size if size is not None else {"shortest_edge": 30} self.parent = parent self.batch_size = batch_size diff --git a/tests/models/idefics/test_processor_idefics.py b/tests/models/idefics/test_processor_idefics.py index bdb5554b9402d7..e658be7ac9f26a 100644 --- a/tests/models/idefics/test_processor_idefics.py +++ b/tests/models/idefics/test_processor_idefics.py @@ -18,6 +18,13 @@ import numpy as np +from transformers import ( + AutoProcessor, + IdeficsImageProcessor, + IdeficsProcessor, + LlamaTokenizerFast, + PreTrainedTokenizerFast, +) from transformers.testing_utils import require_torch, require_vision from transformers.utils import is_torch_available, is_vision_available @@ -30,14 +37,6 @@ if is_vision_available(): from PIL import Image - from transformers import ( - AutoProcessor, - IdeficsImageProcessor, - IdeficsProcessor, - LlamaTokenizerFast, - PreTrainedTokenizerFast, - ) - @require_torch @require_vision From 747fbe1f38050c063a23e6928a45e37ab2338bf2 Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Thu, 15 Aug 2024 23:32:22 +0000 Subject: [PATCH 6/6] Fix processor input args in idefics tests --- tests/models/idefics/test_modeling_idefics.py | 2 +- tests/models/idefics/test_processor_idefics.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/models/idefics/test_modeling_idefics.py b/tests/models/idefics/test_modeling_idefics.py index 0197ebcaff5388..a49bce8d878fb4 100644 --- a/tests/models/idefics/test_modeling_idefics.py +++ b/tests/models/idefics/test_modeling_idefics.py @@ -662,7 +662,7 @@ def test_inference_natural_language_visual_reasoning(self): "HuggingFaceM4/idefics-9b", quantization_config=quantization_config, device_map="auto" ) processor = self.default_processor - inputs = processor(prompts, return_tensors="pt", padding="longest").to(torch_device) + inputs = processor(text=prompts, return_tensors="pt", padding="longest").to(torch_device) generated_ids = model.generate(**inputs, max_length=100) generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True) diff --git a/tests/models/idefics/test_processor_idefics.py b/tests/models/idefics/test_processor_idefics.py index e658be7ac9f26a..c88030775b0143 100644 --- 
a/tests/models/idefics/test_processor_idefics.py +++ b/tests/models/idefics/test_processor_idefics.py @@ -132,7 +132,7 @@ def test_processor(self): prompts = self.prepare_prompts() # test that all prompts succeeded - input_processor = processor(prompts, return_tensors="pt", padding="longest") + input_processor = processor(text=prompts, return_tensors="pt", padding="longest") for key in self.input_keys: assert torch.is_tensor(input_processor[key]) @@ -165,8 +165,8 @@ def test_tokenizer_padding(self): ] prompts = [[prompt] for prompt in self.prepare_prompts()[2]] - max_length = processor(prompts, padding="max_length", truncation=True, max_length=20, return_tensors="pt") - longest = processor(prompts, padding="longest", truncation=True, max_length=30, return_tensors="pt") + max_length = processor(text=prompts, padding="max_length", truncation=True, max_length=20, return_tensors="pt") + longest = processor(text=prompts, padding="longest", truncation=True, max_length=30, return_tensors="pt") decoded_max_length = processor.tokenizer.decode(max_length["input_ids"][-1]) decoded_longest = processor.tokenizer.decode(longest["input_ids"][-1]) @@ -193,8 +193,8 @@ def test_tokenizer_left_padding(self): ([0] * 10) + ([1] * 10), ] prompts = [[prompt] for prompt in self.prepare_prompts()[2]] - max_length = processor(prompts, padding="max_length", truncation=True, max_length=20) - longest = processor(prompts, padding="longest", truncation=True, max_length=30) + max_length = processor(text=prompts, padding="max_length", truncation=True, max_length=20) + longest = processor(text=prompts, padding="longest", truncation=True, max_length=30) decoded_max_length = processor.tokenizer.decode(max_length["input_ids"][-1]) decoded_longest = processor.tokenizer.decode(longest["input_ids"][-1]) @@ -212,7 +212,7 @@ def test_model_input_names(self): processor = IdeficsProcessor(tokenizer=tokenizer, image_processor=image_processor) prompts = self.prepare_prompts() - inputs = processor(prompts, padding="longest", return_tensors="pt") + inputs = processor(text=prompts, padding="longest", return_tensors="pt") # For now the processor supports only ['pixel_values', 'input_ids', 'attention_mask'] self.assertSetEqual(set(inputs.keys()), set(self.input_keys))
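

Taken together, the six patches leave `IdeficsProcessor` and `Idefics2Processor` with the same images-first surface (`images`, `text`, `audio`, `videos`, `**kwargs`) and the shared `ProcessingKwargs` option handling. A closing sketch of the resulting convention — the checkpoint is the one the idefics2 tests load, but the images and prompt below are dummy stand-ins, not values from the diffs:

```python
# Sketch of the uniform calling convention after this series (dummy inputs).
import numpy as np
from PIL import Image
from transformers import AutoProcessor

# Dummy images standing in for the real test fixtures.
image1 = Image.fromarray(np.zeros((64, 64, 3), dtype=np.uint8))
image2 = Image.fromarray(np.full((64, 64, 3), 255, dtype=np.uint8))

processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b")

# Flat kwargs: dispatched into text_kwargs / images_kwargs / common_kwargs
# by _merge_kwargs, overriding the _defaults declared on the kwargs class.
inputs = processor(
    images=[[image1, image2]],
    text=["<image><image>In this image, we see"],
    padding="longest",
    return_tensors="pt",
)

# Structured form, equivalent to the flat call above and exercised by the
# new test_structured_kwargs_* tests.
inputs = processor(
    images=[[image1, image2]],
    text=["<image><image>In this image, we see"],
    text_kwargs={"padding": "longest"},
    common_kwargs={"return_tensors": "pt"},
)
```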