From 508e1a47084ef0941a131dd3a49bf9540a1d1b35 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Fri, 16 Aug 2024 16:21:31 +0800 Subject: [PATCH 01/10] uniformize processor kwargs of nougat --- .../models/nougat/processing_nougat.py | 180 +++++++++++------- 1 file changed, 107 insertions(+), 73 deletions(-) diff --git a/src/transformers/models/nougat/processing_nougat.py b/src/transformers/models/nougat/processing_nougat.py index 8f94c6718ba660..ac7aef12d6199c 100644 --- a/src/transformers/models/nougat/processing_nougat.py +++ b/src/transformers/models/nougat/processing_nougat.py @@ -16,12 +16,52 @@ Processor class for Nougat. """ -from typing import Dict, List, Optional, Union +import sys +import warnings +from typing import List, Optional, Union -from transformers.tokenization_utils_base import PreTokenizedInput, TextInput, TruncationStrategy +from ...image_utils import ImageInput +from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs +from ...tokenization_utils_base import PreTokenizedInput, TextInput -from ...processing_utils import ProcessorMixin -from ...utils import PaddingStrategy, TensorType + +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + + +class NougatTextKwargs(TextKwargs, total=False): + text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] + text_target: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] + text_pair_target: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] + + +class NougatImagesKwargs(ImagesKwargs, total=False): + do_crop_margin: Optional[bool] + do_thumbnail: Optional[bool] + do_align_long_axis: Optional[bool] + + +class NougatProcessorKwargs(ProcessingKwargs, total=False): + text_kwargs: NougatTextKwargs + images_kwargs: NougatImagesKwargs + _defaults = { + "text_kwargs": { + "add_special_tokens": True, + "padding": False, + "stride": 0, + "is_split_into_words": False, + "return_overflowing_tokens": False, + "return_special_tokens_mask": False, + "return_offsets_mapping": False, + "return_length": False, + "verbose": True, + }, + "images_kwargs": { + "data_format": "channels_first", + }, + } class NougatProcessor(ProcessorMixin): @@ -48,86 +88,80 @@ def __init__(self, image_processor, tokenizer): def __call__( self, - images=None, - text=None, - do_crop_margin: bool = None, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: "PILImageResampling" = None, # noqa: F821 - do_thumbnail: bool = None, - do_align_long_axis: bool = None, - do_pad: bool = None, - do_rescale: bool = None, - rescale_factor: Union[int, float] = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - data_format: Optional["ChannelDimension"] = "channels_first", # noqa: F821 - input_data_format: Optional[Union[str, "ChannelDimension"]] = None, # noqa: F821 - text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, - text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, - text_pair_target: Optional[ - Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] - ] = None, - add_special_tokens: bool = True, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = None, - max_length: Optional[int] 
= None, - stride: int = 0, - is_split_into_words: bool = False, - pad_to_multiple_of: Optional[int] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - return_token_type_ids: Optional[bool] = None, - return_attention_mask: Optional[bool] = None, - return_overflowing_tokens: bool = False, - return_special_tokens_mask: bool = False, - return_offsets_mapping: bool = False, - return_length: bool = False, - verbose: bool = True, + images: Optional[ImageInput] = None, + text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + audio=None, + videos=None, + backwards_compatibility_placeholder_arg=None, + **kwargs: Unpack[NougatProcessorKwargs], ): if images is None and text is None: raise ValueError("You need to specify either an `images` or `text` input to process.") - if images is not None: - inputs = self.image_processor( - images, - do_crop_margin=do_crop_margin, - do_resize=do_resize, - size=size, - resample=resample, - do_thumbnail=do_thumbnail, - do_align_long_axis=do_align_long_axis, - do_pad=do_pad, - do_rescale=do_rescale, - rescale_factor=rescale_factor, - do_normalize=do_normalize, - image_mean=image_mean, - image_std=image_std, - return_tensors=return_tensors, - data_format=data_format, - input_data_format=input_data_format, + output_kwargs = self._merge_kwargs( + NougatProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + + if output_kwargs["text_kwargs"].get("text_pair") is not None and audio is not None: + raise ValueError( + "You cannot provide `text_pair` as a positional argument and as a keyword argument at the same time." + "Please provide it only as a keyword argument (i.e. `text_pair=...`)." + ) + if "text_pair" not in output_kwargs["text_kwargs"]: + warnings.warn( + "No `text_pair` kwarg was detected. The use of `text_pair` as an argument without specifying it explicitely as `text_pair=` will be deprecated in future versions." ) + # For backwards compatibility, we reuse `audio` as `text_pair` in case + # downstream users passed it as a positional argument + if audio is not None: + output_kwargs["text_kwargs"]["text_pair"] = audio + + if output_kwargs["text_kwargs"].get("text_target") is not None and videos is not None: + raise ValueError( + "You cannot provide `text_target` as a positional argument and as a keyword argument at the same time." + "Please provide it only as a keyword argument (i.e. `text_target=...`)." + ) + if "text_target" not in output_kwargs["text_kwargs"]: + warnings.warn( + "No `text_target` kwarg was detected. The use of `text_target` as an argument without specifying it explicitely as `text_target=` will be deprecated in future versions." + ) + # For backwards compatibility, we reuse `videos` as `text_target` in case + # downstream users passed it as a positional argument + if videos is not None: + output_kwargs["text_kwargs"]["text_target"] = videos + + if ( + output_kwargs["text_kwargs"].get("text_pair_target") is not None + and backwards_compatibility_placeholder_arg is not None + ): + raise ValueError( + "You cannot provide `text_pair_target` as a positional argument and as a keyword argument at the same time." + "Please provide it only as a keyword argument (i.e. `text_pair_target=...`)." + ) + if "text_pair_target" not in output_kwargs["text_kwargs"]: + warnings.warn( + "No `text_pair_target` kwarg was detected. The use of `text_pair_target` as an argument without specifying it explicitely as `text_pair_target=` will be deprecated in future versions." 
+ ) + # For backwards compatibility, we reuse `backwards_compatibility_placeholder_arg` as `text_pair_target` in case + # downstream users passed it as a positional argument + if backwards_compatibility_placeholder_arg is not None: + output_kwargs["text_kwargs"]["text_pair_target"] = backwards_compatibility_placeholder_arg + + text_pair = output_kwargs["text_kwargs"].pop("text_pair", None) + text_target = output_kwargs["text_kwargs"].pop("text_target", None) + text_pair_target = output_kwargs["text_kwargs"].pop("text_pair_target", None) + + if images is not None: + inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) if text is not None: encodings = self.tokenizer( text, text_pair=text_pair, text_target=text_target, text_pair_target=text_pair_target, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - stride=stride, - is_split_into_words=is_split_into_words, - pad_to_multiple_of=pad_to_multiple_of, - return_tensors=return_tensors, - return_token_type_ids=return_token_type_ids, - return_attention_mask=return_attention_mask, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_offsets_mapping=return_offsets_mapping, - return_length=return_length, - verbose=verbose, + **output_kwargs["text_kwargs"], ) if text is None: From 257c690fe4cfb71cdeb4c2c3ec53db2176030343 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Fri, 16 Aug 2024 18:30:21 +0800 Subject: [PATCH 02/10] add tests and more docs --- .../models/donut/image_processing_donut.py | 2 ++ .../models/fuyu/image_processing_fuyu.py | 3 ++- .../models/nougat/image_processing_nougat.py | 2 ++ .../models/nougat/processing_nougat.py | 14 +++++----- .../models/sam/image_processing_sam.py | 1 + .../models/tvp/image_processing_tvp.py | 1 + tests/models/nougat/test_processor_nougat.py | 17 ++++++++++++ tests/test_processing_common.py | 26 ++++++++++--------- 8 files changed, 47 insertions(+), 19 deletions(-) create mode 100644 tests/models/nougat/test_processor_nougat.py diff --git a/src/transformers/models/donut/image_processing_donut.py b/src/transformers/models/donut/image_processing_donut.py index edb0629d44bd04..b56db329420460 100644 --- a/src/transformers/models/donut/image_processing_donut.py +++ b/src/transformers/models/donut/image_processing_donut.py @@ -183,6 +183,7 @@ def pad_image( input_data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format of the input image. If not provided, it will be inferred. """ + size = get_size_dict(size) output_height, output_width = size["height"], size["width"] input_height, input_width = get_image_size(image, channel_dim=input_data_format) @@ -232,6 +233,7 @@ def thumbnail( The channel dimension format of the input image. If not provided, it will be inferred. """ input_height, input_width = get_image_size(image, channel_dim=input_data_format) + size = get_size_dict(size) output_height, output_width = size["height"], size["width"] # We always resize to the smallest of either the input or output size. 
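[Note on the `get_size_dict` additions in this patch] The `size = get_size_dict(size)` / `pad_size = get_size_dict(pad_size)` lines added here in Donut's `pad_image` and `thumbnail`, and below in the Fuyu, Nougat, SAM and TVP image processors, all address the same issue: with the uniformized kwargs path, `size` can reach these helpers as an int or a `[height, width]` list (the new common tests pass `size=[224, 224]`) rather than a ready-made dict, so each helper now normalizes the value before indexing `size["height"]`. A minimal sketch of that normalization, assuming the behavior of the existing `get_size_dict` helper in `transformers.image_processing_utils` (per-call defaults such as `default_to_square` may differ):

    from transformers.image_processing_utils import get_size_dict

    # All three spellings of the same target size are normalized to a height/width dict,
    # so the size["height"] lookups in pad_image/thumbnail no longer fail for ints or lists.
    for raw_size in (896, (896, 672), {"height": 896, "width": 672}):
        normalized = get_size_dict(raw_size)
        print(normalized)  # e.g. {"height": 896, "width": 672}
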
diff --git a/src/transformers/models/fuyu/image_processing_fuyu.py b/src/transformers/models/fuyu/image_processing_fuyu.py index 255922b8308889..19eb1d0e7e3e77 100644 --- a/src/transformers/models/fuyu/image_processing_fuyu.py +++ b/src/transformers/models/fuyu/image_processing_fuyu.py @@ -19,7 +19,7 @@ import numpy as np -from ...image_processing_utils import BaseImageProcessor, BatchFeature +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict from ...image_transforms import ( pad, resize, @@ -344,6 +344,7 @@ def pad_image( The channel dimension format of the input image. If not provided, it will be inferred. """ image_height, image_width = get_image_size(image, input_data_format) + size = get_size_dict(size) target_height, target_width = size["height"], size["width"] padding_top = 0 padding_left = 0 diff --git a/src/transformers/models/nougat/image_processing_nougat.py b/src/transformers/models/nougat/image_processing_nougat.py index 792f4a14325a0a..ff8090964e26ef 100644 --- a/src/transformers/models/nougat/image_processing_nougat.py +++ b/src/transformers/models/nougat/image_processing_nougat.py @@ -250,6 +250,7 @@ def pad_image( input_data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format of the input image. If not provided, it will be inferred. """ + size = get_size_dict(size) output_height, output_width = size["height"], size["width"] input_height, input_width = get_image_size(image, channel_dim=input_data_format) @@ -292,6 +293,7 @@ def thumbnail( The channel dimension format of the input image. If not provided, it will be inferred. """ input_height, input_width = get_image_size(image, channel_dim=input_data_format) + size = get_size_dict(size) output_height, output_width = size["height"], size["width"] # We always resize to the smallest of either the input or output size. diff --git a/src/transformers/models/nougat/processing_nougat.py b/src/transformers/models/nougat/processing_nougat.py index ac7aef12d6199c..f63fcb4082f5f5 100644 --- a/src/transformers/models/nougat/processing_nougat.py +++ b/src/transformers/models/nougat/processing_nougat.py @@ -103,7 +103,11 @@ def __call__( tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, ) + # Temporary fix for "paddding_side" in init_kwargs + _ = output_kwargs["text_kwargs"].pop("padding_side", None) + # For backwards compatibility, we reuse `audio` as `text_pair` + # in case downstream users passed it as a positional argument if output_kwargs["text_kwargs"].get("text_pair") is not None and audio is not None: raise ValueError( "You cannot provide `text_pair` as a positional argument and as a keyword argument at the same time." @@ -113,11 +117,11 @@ def __call__( warnings.warn( "No `text_pair` kwarg was detected. The use of `text_pair` as an argument without specifying it explicitely as `text_pair=` will be deprecated in future versions." ) - # For backwards compatibility, we reuse `audio` as `text_pair` in case - # downstream users passed it as a positional argument if audio is not None: output_kwargs["text_kwargs"]["text_pair"] = audio + # For backwards compatibility, we reuse `videos` as `text_target` + # in case downstream users passed it as a positional argument if output_kwargs["text_kwargs"].get("text_target") is not None and videos is not None: raise ValueError( "You cannot provide `text_target` as a positional argument and as a keyword argument at the same time." @@ -127,11 +131,11 @@ def __call__( warnings.warn( "No `text_target` kwarg was detected. 
The use of `text_target` as an argument without specifying it explicitely as `text_target=` will be deprecated in future versions." ) - # For backwards compatibility, we reuse `videos` as `text_target` in case - # downstream users passed it as a positional argument if videos is not None: output_kwargs["text_kwargs"]["text_target"] = videos + # For backwards compatibility, we reuse `backwards_compatibility_placeholder_arg` as `text_pair_target` + # in case downstream users passed it as a positional argument if ( output_kwargs["text_kwargs"].get("text_pair_target") is not None and backwards_compatibility_placeholder_arg is not None @@ -144,8 +148,6 @@ def __call__( warnings.warn( "No `text_pair_target` kwarg was detected. The use of `text_pair_target` as an argument without specifying it explicitely as `text_pair_target=` will be deprecated in future versions." ) - # For backwards compatibility, we reuse `backwards_compatibility_placeholder_arg` as `text_pair_target` in case - # downstream users passed it as a positional argument if backwards_compatibility_placeholder_arg is not None: output_kwargs["text_kwargs"]["text_pair_target"] = backwards_compatibility_placeholder_arg diff --git a/src/transformers/models/sam/image_processing_sam.py b/src/transformers/models/sam/image_processing_sam.py index beea3f4b01c311..ff86fdeb577e08 100644 --- a/src/transformers/models/sam/image_processing_sam.py +++ b/src/transformers/models/sam/image_processing_sam.py @@ -185,6 +185,7 @@ def pad_image( input_data_format (`str` or `ChannelDimension`, *optional*): The channel dimension format of the input image. If not provided, it will be inferred. """ + pad_size = get_size_dict(pad_size) output_height, output_width = pad_size["height"], pad_size["width"] input_height, input_width = get_image_size(image, channel_dim=input_data_format) diff --git a/src/transformers/models/tvp/image_processing_tvp.py b/src/transformers/models/tvp/image_processing_tvp.py index 100ec133e8b026..dfa4902c9a442b 100644 --- a/src/transformers/models/tvp/image_processing_tvp.py +++ b/src/transformers/models/tvp/image_processing_tvp.py @@ -244,6 +244,7 @@ def pad_image( The channel dimension format of the input image. If not provided, it will be inferred. 
""" height, width = get_image_size(image, channel_dim=input_data_format) + pad_size = get_size_dict(pad_size) max_height = pad_size.get("height", height) max_width = pad_size.get("width", width) diff --git a/tests/models/nougat/test_processor_nougat.py b/tests/models/nougat/test_processor_nougat.py new file mode 100644 index 00000000000000..ca512684cc68bd --- /dev/null +++ b/tests/models/nougat/test_processor_nougat.py @@ -0,0 +1,17 @@ +import tempfile +import unittest + +from transformers import NougatProcessor + +from ...test_processing_common import ProcessorTesterMixin + + +class NougatProcessorTest(ProcessorTesterMixin, unittest.TestCase): + from_pretrained_id = "facebook/nougat-base" + text_data_arg_name = "labels" + processor_class = NougatProcessor + + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + processor = self.processor_class.from_pretrained(self.from_pretrained_id) + processor.save_pretrained(self.tmpdirname) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index a30c6363b9d7ff..577341fe531b6e 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -48,6 +48,8 @@ @require_vision @require_torch class ProcessorTesterMixin: + image_data_arg_name = "pixel_values" + text_data_arg_name = "input_ids" processor_class = None def prepare_processor_dict(self): @@ -136,7 +138,7 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input, return_tensors="pt") - self.assertEqual(len(inputs["input_ids"][0]), 117) + self.assertEqual(len(inputs[self.text_data_arg_name][0]), 117) @require_torch @require_vision @@ -153,7 +155,7 @@ def test_image_processor_defaults_preserved_by_image_kwargs(self): image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input) - self.assertEqual(len(inputs["pixel_values"][0][0]), 234) + self.assertEqual(len(inputs[self.image_data_arg_name][0][0]), 234) @require_vision @require_torch @@ -171,7 +173,7 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): inputs = processor( text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length" ) - self.assertEqual(len(inputs["input_ids"][0]), 112) + self.assertEqual(len(inputs[self.text_data_arg_name][0]), 112) @require_torch @require_vision @@ -188,7 +190,7 @@ def test_kwargs_overrides_default_image_processor_kwargs(self): image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input, size=[224, 224]) - self.assertEqual(len(inputs["pixel_values"][0][0]), 224) + self.assertEqual(len(inputs[self.image_data_arg_name][0][0]), 224) @require_torch @require_vision @@ -212,8 +214,8 @@ def test_unstructured_kwargs(self): max_length=76, ) - self.assertEqual(inputs["pixel_values"].shape[2], 214) - self.assertEqual(len(inputs["input_ids"][0]), 76) + self.assertEqual(inputs[self.image_data_arg_name].shape[2], 214) + self.assertEqual(len(inputs[self.text_data_arg_name][0]), 76) @require_torch @require_vision @@ -237,9 +239,9 @@ def test_unstructured_kwargs_batched(self): max_length=76, ) - self.assertEqual(inputs["pixel_values"].shape[2], 214) + self.assertEqual(inputs[self.image_data_arg_name].shape[2], 214) - self.assertEqual(len(inputs["input_ids"][0]), 6) + self.assertEqual(len(inputs[self.text_data_arg_name][0]), 6) @require_torch @require_vision @@ -286,9 +288,9 @@ def test_structured_kwargs_nested(self): inputs = processor(text=input_str, images=image_input, 
**all_kwargs) self.skip_processor_without_typed_kwargs(processor) - self.assertEqual(inputs["pixel_values"].shape[2], 214) + self.assertEqual(inputs[self.image_data_arg_name].shape[2], 214) - self.assertEqual(len(inputs["input_ids"][0]), 76) + self.assertEqual(len(inputs[self.text_data_arg_name][0]), 76) @require_torch @require_vision @@ -312,9 +314,9 @@ def test_structured_kwargs_nested_from_dict(self): } inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.assertEqual(inputs["pixel_values"].shape[2], 214) + self.assertEqual(inputs[self.image_data_arg_name].shape[2], 214) - self.assertEqual(len(inputs["input_ids"][0]), 76) + self.assertEqual(len(inputs[self.text_data_arg_name][0]), 76) class MyProcessor(ProcessorMixin): From 93e70702a952d546a08d15ed52a67ac33114e083 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Sat, 17 Aug 2024 14:17:29 +0800 Subject: [PATCH 03/10] add uniformization of processor kwargs of processors with special keys here --- .../models/clipseg/processing_clipseg.py | 72 +++++++++++++---- .../models/owlv2/image_processing_owlv2.py | 3 +- .../models/owlv2/processing_owlv2.py | 80 +++++++++++++++---- .../models/owlvit/processing_owlvit.py | 79 ++++++++++++++---- .../models/clipseg/test_processor_clipseg.py | 10 ++- tests/models/owlv2/test_processor_owlv2.py | 18 +++++ tests/models/owlvit/test_processor_owlvit.py | 10 ++- 7 files changed, 217 insertions(+), 55 deletions(-) create mode 100644 tests/models/owlv2/test_processor_owlv2.py diff --git a/src/transformers/models/clipseg/processing_clipseg.py b/src/transformers/models/clipseg/processing_clipseg.py index f8eaca82334a22..bbec55fabf99f7 100644 --- a/src/transformers/models/clipseg/processing_clipseg.py +++ b/src/transformers/models/clipseg/processing_clipseg.py @@ -16,10 +16,28 @@ Image/Text processor class for CLIPSeg """ +import sys import warnings +from typing import List, Optional, Union -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import BatchEncoding +from ...image_utils import ImageInput +from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput + + +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + + +class CLIPSegImagesKwargs(ImagesKwargs, total=False): + visual_prompt: Optional[ImageInput] + + +class CLIPSegProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: CLIPSegImagesKwargs + _defaults = {} class CLIPSegProcessor(ProcessorMixin): @@ -58,7 +76,14 @@ def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) - def __call__(self, text=None, images=None, visual_prompt=None, return_tensors=None, **kwargs): + def __call__( + self, + text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + images: Optional[ImageInput] = None, + audio=None, + videos=None, + **kwargs: Unpack[CLIPSegProcessorKwargs], + ): """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode @@ -79,14 +104,6 @@ def __call__(self, text=None, images=None, visual_prompt=None, return_tensors=No NumPy array or PyTorch tensor. 
In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a number of channels, H and W are image height and width. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - - `'tf'`: Return TensorFlow `tf.constant` objects. - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - - `'jax'`: Return JAX `jnp.ndarray` objects. - Returns: [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: @@ -96,6 +113,29 @@ def __call__(self, text=None, images=None, visual_prompt=None, return_tensors=No `None`). - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ + + output_kwargs = self._merge_kwargs( + CLIPSegProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + + if output_kwargs["images_kwargs"].get("visual_prompt") is not None and audio is not None: + raise ValueError( + "You cannot provide `visual_prompt` as a positional argument and as a keyword argument at the same time." + "Please provide it only as a keyword argument (i.e. `visual_prompt=...`)." + ) + if "visual_prompt" not in output_kwargs["images_kwargs"]: + warnings.warn( + "No `visual_prompt` kwarg was detected. The use of `visual_prompt` as an argument without specifying it explicitely as `visual_prompt=` will be deprecated in future versions." + ) + # For backwards compatibility, we reuse `audio` as `visual_prompt` in case + # downstream users passed it as a positional argument + if audio is not None: + output_kwargs["images_kwargs"]["visual_prompt"] = audio + + visual_prompt = output_kwargs["images_kwargs"].pop("visual_prompt", None) + if text is None and visual_prompt is None and images is None: raise ValueError("You have to specify either text, visual prompt or images.") @@ -103,13 +143,13 @@ def __call__(self, text=None, images=None, visual_prompt=None, return_tensors=No raise ValueError("You have to specify exactly one type of prompt. 
Either text or visual prompt.") if text is not None: - encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs) + encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) if visual_prompt is not None: - prompt_features = self.image_processor(visual_prompt, return_tensors=return_tensors, **kwargs) + prompt_features = self.image_processor(visual_prompt, **output_kwargs["images_kwargs"]) if images is not None: - image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs) + image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) if visual_prompt is not None and images is not None: encoding = { @@ -128,7 +168,9 @@ def __call__(self, text=None, images=None, visual_prompt=None, return_tensors=No } return encoding else: - return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) + return BatchEncoding( + data=dict(**image_features), tensor_type=output_kwargs["common_kwargs"].get("return_tensors") + ) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/owlv2/image_processing_owlv2.py b/src/transformers/models/owlv2/image_processing_owlv2.py index dd32dc9f141183..eca806af296316 100644 --- a/src/transformers/models/owlv2/image_processing_owlv2.py +++ b/src/transformers/models/owlv2/image_processing_owlv2.py @@ -19,7 +19,7 @@ import numpy as np -from ...image_processing_utils import BaseImageProcessor, BatchFeature +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict from ...image_transforms import ( center_to_corners_format, pad, @@ -296,6 +296,7 @@ def resize( """ requires_backends(self, "scipy") + size = get_size_dict(size) output_shape = (size["height"], size["width"]) image = to_channel_dimension_format(image, ChannelDimension.LAST) image, output_shape = _preprocess_resize_output_shape(image, output_shape) diff --git a/src/transformers/models/owlv2/processing_owlv2.py b/src/transformers/models/owlv2/processing_owlv2.py index 8b580ca5026618..dc8fefd434762b 100644 --- a/src/transformers/models/owlv2/processing_owlv2.py +++ b/src/transformers/models/owlv2/processing_owlv2.py @@ -16,15 +16,40 @@ Image/Text processor class for OWLv2 """ -from typing import List +import sys +import warnings +from typing import List, Optional, Union import numpy as np -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import BatchEncoding +from ...image_utils import ImageInput +from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput from ...utils import is_flax_available, is_tf_available, is_torch_available +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + + +class Owlv2ImagesKwargs(ImagesKwargs, total=False): + query_images: Optional[ImageInput] + + +class Owlv2ProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: Owlv2ImagesKwargs + _defaults = { + "text_kwargs": { + "padding": "max_length", + }, + "common_kwargs": { + "return_tensors": "np", + }, + } + + class Owlv2Processor(ProcessorMixin): r""" Constructs an Owlv2 processor which wraps [`Owlv2ImageProcessor`] and [`CLIPTokenizer`]/[`CLIPTokenizerFast`] into @@ -45,8 +70,14 @@ class Owlv2Processor(ProcessorMixin): def __init__(self, image_processor, tokenizer, **kwargs): super().__init__(image_processor, tokenizer) - # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.__call__ with 
OWLViT->OWLv2 - def __call__(self, text=None, images=None, query_images=None, padding="max_length", return_tensors="np", **kwargs): + def __call__( + self, + text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + images: Optional[ImageInput] = None, + audio=None, + videos=None, + **kwargs: Unpack[Owlv2ProcessorKwargs], + ): """ Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode: @@ -67,12 +98,7 @@ def __call__(self, text=None, images=None, query_images=None, padding="max_lengt The query image to be prepared, one query image is expected per target image to be queried. Each image can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a number of channels, H and W are image height and width. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - `'tf'`: Return TensorFlow `tf.constant` objects. - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - - `'jax'`: Return JAX `jnp.ndarray` objects. + Returns: [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. @@ -81,6 +107,28 @@ def __call__(self, text=None, images=None, query_images=None, padding="max_lengt `None`). - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ + output_kwargs = self._merge_kwargs( + Owlv2ProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + + if output_kwargs["images_kwargs"].get("query_images") is not None and audio is not None: + raise ValueError( + "You cannot provide `query_images` as a positional argument and as a keyword argument at the same time." + "Please provide it only as a keyword argument (i.e. `query_images=...`)." + ) + if "query_images" not in output_kwargs["images_kwargs"]: + warnings.warn( + "No `query_images` kwarg was detected. The use of `query_images` as an argument without specifying it explicitely as `query_images=` will be deprecated in future versions." 
+ ) + # For backwards compatibility, we reuse `audio` as `query_images` in case + # downstream users passed it as a positional argument + if audio is not None: + output_kwargs["images_kwargs"]["query_images"] = audio + + query_images = output_kwargs["images_kwargs"].pop("query_images", None) + return_tensors = output_kwargs["common_kwargs"]["return_tensors"] if text is None and query_images is None and images is None: raise ValueError( @@ -89,7 +137,7 @@ def __call__(self, text=None, images=None, query_images=None, padding="max_lengt if text is not None: if isinstance(text, str) or (isinstance(text, List) and not isinstance(text[0], List)): - encodings = [self.tokenizer(text, padding=padding, return_tensors=return_tensors, **kwargs)] + encodings = [self.tokenizer(text, **output_kwargs["text_kwargs"])] elif isinstance(text, List) and isinstance(text[0], List): encodings = [] @@ -102,7 +150,7 @@ def __call__(self, text=None, images=None, query_images=None, padding="max_lengt if len(t) != max_num_queries: t = t + [" "] * (max_num_queries - len(t)) - encoding = self.tokenizer(t, padding=padding, return_tensors=return_tensors, **kwargs) + encoding = self.tokenizer(t, **output_kwargs["text_kwargs"]) encodings.append(encoding) else: raise TypeError("Input text should be a string, a list of strings or a nested list of strings") @@ -138,13 +186,11 @@ def __call__(self, text=None, images=None, query_images=None, padding="max_lengt if query_images is not None: encoding = BatchEncoding() - query_pixel_values = self.image_processor( - query_images, return_tensors=return_tensors, **kwargs - ).pixel_values + query_pixel_values = self.image_processor(query_images, **output_kwargs["images_kwargs"]).pixel_values encoding["query_pixel_values"] = query_pixel_values if images is not None: - image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs) + image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) if text is not None and images is not None: encoding["pixel_values"] = image_features.pixel_values diff --git a/src/transformers/models/owlvit/processing_owlvit.py b/src/transformers/models/owlvit/processing_owlvit.py index 2c7d490104bdfc..d6f8389b94c4b9 100644 --- a/src/transformers/models/owlvit/processing_owlvit.py +++ b/src/transformers/models/owlvit/processing_owlvit.py @@ -16,16 +16,40 @@ Image/Text processor class for OWL-ViT """ +import sys import warnings -from typing import List +from typing import List, Optional, Union import numpy as np -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import BatchEncoding +from ...image_utils import ImageInput +from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput from ...utils import is_flax_available, is_tf_available, is_torch_available +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + + +class OwlViTImagesKwargs(ImagesKwargs, total=False): + query_images: Optional[ImageInput] + + +class OwlViTProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: OwlViTImagesKwargs + _defaults = { + "text_kwargs": { + "padding": "max_length", + }, + "common_kwargs": { + "return_tensors": "np", + }, + } + + class OwlViTProcessor(ProcessorMixin): r""" Constructs an OWL-ViT processor which wraps [`OwlViTImageProcessor`] and [`CLIPTokenizer`]/[`CLIPTokenizerFast`] @@ -61,7 +85,14 @@ def __init__(self, image_processor=None, 
tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) - def __call__(self, text=None, images=None, query_images=None, padding="max_length", return_tensors="np", **kwargs): + def __call__( + self, + text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + images: Optional[ImageInput] = None, + audio=None, + videos=None, + **kwargs: Unpack[OwlViTProcessorKwargs], + ): """ Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode: @@ -82,12 +113,7 @@ def __call__(self, text=None, images=None, query_images=None, padding="max_lengt The query image to be prepared, one query image is expected per target image to be queried. Each image can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a number of channels, H and W are image height and width. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - `'tf'`: Return TensorFlow `tf.constant` objects. - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - - `'jax'`: Return JAX `jnp.ndarray` objects. + Returns: [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. @@ -97,6 +123,29 @@ def __call__(self, text=None, images=None, query_images=None, padding="max_lengt - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ + output_kwargs = self._merge_kwargs( + OwlViTProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + + if output_kwargs["images_kwargs"].get("query_images") is not None and audio is not None: + raise ValueError( + "You cannot provide `query_images` as a positional argument and as a keyword argument at the same time." + "Please provide it only as a keyword argument (i.e. `query_images=...`)." + ) + if "query_images" not in output_kwargs["images_kwargs"]: + warnings.warn( + "No `query_images` kwarg was detected. The use of `query_images` as an argument without specifying it explicitely as `query_images=` will be deprecated in future versions." + ) + # For backwards compatibility, we reuse `audio` as `query_images` in case + # downstream users passed it as a positional argument + if audio is not None: + output_kwargs["images_kwargs"]["query_images"] = audio + + query_images = output_kwargs["images_kwargs"].pop("query_images", None) + return_tensors = output_kwargs["common_kwargs"]["return_tensors"] + if text is None and query_images is None and images is None: raise ValueError( "You have to specify at least one text or query image or image. All three cannot be none." 
@@ -104,7 +153,7 @@ def __call__(self, text=None, images=None, query_images=None, padding="max_lengt if text is not None: if isinstance(text, str) or (isinstance(text, List) and not isinstance(text[0], List)): - encodings = [self.tokenizer(text, padding=padding, return_tensors=return_tensors, **kwargs)] + encodings = [self.tokenizer(text, **output_kwargs["text_kwargs"])] elif isinstance(text, List) and isinstance(text[0], List): encodings = [] @@ -117,7 +166,7 @@ def __call__(self, text=None, images=None, query_images=None, padding="max_lengt if len(t) != max_num_queries: t = t + [" "] * (max_num_queries - len(t)) - encoding = self.tokenizer(t, padding=padding, return_tensors=return_tensors, **kwargs) + encoding = self.tokenizer(t, **output_kwargs["text_kwargs"]) encodings.append(encoding) else: raise TypeError("Input text should be a string, a list of strings or a nested list of strings") @@ -153,13 +202,11 @@ def __call__(self, text=None, images=None, query_images=None, padding="max_lengt if query_images is not None: encoding = BatchEncoding() - query_pixel_values = self.image_processor( - query_images, return_tensors=return_tensors, **kwargs - ).pixel_values + query_pixel_values = self.image_processor(query_images, **output_kwargs["images_kwargs"]).pixel_values encoding["query_pixel_values"] = query_pixel_values if images is not None: - image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs) + image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) if text is not None and images is not None: encoding["pixel_values"] = image_features.pixel_values diff --git a/tests/models/clipseg/test_processor_clipseg.py b/tests/models/clipseg/test_processor_clipseg.py index e33049b2768fe4..bfc7d59c1b0ff3 100644 --- a/tests/models/clipseg/test_processor_clipseg.py +++ b/tests/models/clipseg/test_processor_clipseg.py @@ -21,20 +21,24 @@ import numpy as np import pytest -from transformers import CLIPTokenizer, CLIPTokenizerFast +from transformers import CLIPSegProcessor, CLIPTokenizer, CLIPTokenizerFast from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES from transformers.testing_utils import require_vision from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available +from ...test_processing_common import ProcessorTesterMixin + if is_vision_available(): from PIL import Image - from transformers import CLIPSegProcessor, ViTImageProcessor + from transformers import ViTImageProcessor @require_vision -class CLIPSegProcessorTest(unittest.TestCase): +class CLIPSegProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = CLIPSegProcessor + def setUp(self): self.tmpdirname = tempfile.mkdtemp() diff --git a/tests/models/owlv2/test_processor_owlv2.py b/tests/models/owlv2/test_processor_owlv2.py new file mode 100644 index 00000000000000..b8f8b5d26cfe41 --- /dev/null +++ b/tests/models/owlv2/test_processor_owlv2.py @@ -0,0 +1,18 @@ +import tempfile +import unittest + +from transformers import Owlv2Processor +from transformers.testing_utils import require_scipy + +from ...test_processing_common import ProcessorTesterMixin + + +@require_scipy +class Owlv2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): + from_pretrained_id = "google/owlv2-base-patch16-ensemble" + processor_class = Owlv2Processor + + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + processor = self.processor_class.from_pretrained(self.from_pretrained_id) + processor.save_pretrained(self.tmpdirname) diff --git 
a/tests/models/owlvit/test_processor_owlvit.py b/tests/models/owlvit/test_processor_owlvit.py index b271c8880bfddc..3fadfac0046002 100644 --- a/tests/models/owlvit/test_processor_owlvit.py +++ b/tests/models/owlvit/test_processor_owlvit.py @@ -21,20 +21,24 @@ import numpy as np import pytest -from transformers import CLIPTokenizer, CLIPTokenizerFast +from transformers import CLIPTokenizer, CLIPTokenizerFast, OwlViTProcessor from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES from transformers.testing_utils import require_vision from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available +from ...test_processing_common import ProcessorTesterMixin + if is_vision_available(): from PIL import Image - from transformers import OwlViTImageProcessor, OwlViTProcessor + from transformers import OwlViTImageProcessor @require_vision -class OwlViTProcessorTest(unittest.TestCase): +class OwlViTProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = OwlViTProcessor + def setUp(self): self.tmpdirname = tempfile.mkdtemp() From 0128d199daaf90cb196ffcf82256b6df8d3364f3 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Sat, 17 Aug 2024 17:39:46 +0800 Subject: [PATCH 04/10] refactor how we handle arguments passed as positional args --- .../models/clipseg/processing_clipseg.py | 28 +--- .../models/nougat/processing_nougat.py | 68 +-------- .../models/owlv2/processing_owlv2.py | 28 ++-- .../models/owlvit/processing_owlvit.py | 27 ++-- src/transformers/processing_utils.py | 62 ++++++++ .../models/clipseg/test_processor_clipseg.py | 17 +++ tests/test_processing_common.py | 141 +++++++++++------- 7 files changed, 197 insertions(+), 174 deletions(-) diff --git a/src/transformers/models/clipseg/processing_clipseg.py b/src/transformers/models/clipseg/processing_clipseg.py index bbec55fabf99f7..7219fc7d4831ea 100644 --- a/src/transformers/models/clipseg/processing_clipseg.py +++ b/src/transformers/models/clipseg/processing_clipseg.py @@ -21,7 +21,7 @@ from typing import List, Optional, Union from ...image_utils import ImageInput -from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin +from ...processing_utils import ProcessingKwargs, ProcessorMixin from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput @@ -31,12 +31,7 @@ from typing_extensions import Unpack -class CLIPSegImagesKwargs(ImagesKwargs, total=False): - visual_prompt: Optional[ImageInput] - - class CLIPSegProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: CLIPSegImagesKwargs _defaults = {} @@ -57,6 +52,8 @@ class CLIPSegProcessor(ProcessorMixin): attributes = ["image_processor", "tokenizer"] image_processor_class = "ViTImageProcessor" tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast") + # For backward compatibility. See transformers.processing_utils.ProcessorMixin.prepare_and_validate_optional_call_args for more details. + optional_call_args = ["visual_prompt"] def __init__(self, image_processor=None, tokenizer=None, **kwargs): feature_extractor = None @@ -80,6 +77,10 @@ def __call__( self, text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, images: Optional[ImageInput] = None, + # The following is to capture `visual_prompt` argument that may be passed as a positional argument. + # See transformers.processing_utils.ProcessorMixin.prepare_and_validate_optional_call_args for more details. 
+ # This behavior is only needed for backward compatibility and will be removed in future versions. + *args, audio=None, videos=None, **kwargs: Unpack[CLIPSegProcessorKwargs], @@ -118,22 +119,9 @@ def __call__( CLIPSegProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, + **self.prepare_and_validate_optional_call_args(*args), ) - if output_kwargs["images_kwargs"].get("visual_prompt") is not None and audio is not None: - raise ValueError( - "You cannot provide `visual_prompt` as a positional argument and as a keyword argument at the same time." - "Please provide it only as a keyword argument (i.e. `visual_prompt=...`)." - ) - if "visual_prompt" not in output_kwargs["images_kwargs"]: - warnings.warn( - "No `visual_prompt` kwarg was detected. The use of `visual_prompt` as an argument without specifying it explicitely as `visual_prompt=` will be deprecated in future versions." - ) - # For backwards compatibility, we reuse `audio` as `visual_prompt` in case - # downstream users passed it as a positional argument - if audio is not None: - output_kwargs["images_kwargs"]["visual_prompt"] = audio - visual_prompt = output_kwargs["images_kwargs"].pop("visual_prompt", None) if text is None and visual_prompt is None and images is None: diff --git a/src/transformers/models/nougat/processing_nougat.py b/src/transformers/models/nougat/processing_nougat.py index f63fcb4082f5f5..cbb43449dbe355 100644 --- a/src/transformers/models/nougat/processing_nougat.py +++ b/src/transformers/models/nougat/processing_nougat.py @@ -17,11 +17,10 @@ """ import sys -import warnings from typing import List, Optional, Union from ...image_utils import ImageInput -from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs +from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin from ...tokenization_utils_base import PreTokenizedInput, TextInput @@ -31,12 +30,6 @@ from typing_extensions import Unpack -class NougatTextKwargs(TextKwargs, total=False): - text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] - text_target: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] - text_pair_target: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] - - class NougatImagesKwargs(ImagesKwargs, total=False): do_crop_margin: Optional[bool] do_thumbnail: Optional[bool] @@ -44,7 +37,6 @@ class NougatImagesKwargs(ImagesKwargs, total=False): class NougatProcessorKwargs(ProcessingKwargs, total=False): - text_kwargs: NougatTextKwargs images_kwargs: NougatImagesKwargs _defaults = { "text_kwargs": { @@ -92,7 +84,6 @@ def __call__( text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, audio=None, videos=None, - backwards_compatibility_placeholder_arg=None, **kwargs: Unpack[NougatProcessorKwargs], ): if images is None and text is None: @@ -106,65 +97,10 @@ def __call__( # Temporary fix for "paddding_side" in init_kwargs _ = output_kwargs["text_kwargs"].pop("padding_side", None) - # For backwards compatibility, we reuse `audio` as `text_pair` - # in case downstream users passed it as a positional argument - if output_kwargs["text_kwargs"].get("text_pair") is not None and audio is not None: - raise ValueError( - "You cannot provide `text_pair` as a positional argument and as a keyword argument at the same time." - "Please provide it only as a keyword argument (i.e. `text_pair=...`)." 
- ) - if "text_pair" not in output_kwargs["text_kwargs"]: - warnings.warn( - "No `text_pair` kwarg was detected. The use of `text_pair` as an argument without specifying it explicitely as `text_pair=` will be deprecated in future versions." - ) - if audio is not None: - output_kwargs["text_kwargs"]["text_pair"] = audio - - # For backwards compatibility, we reuse `videos` as `text_target` - # in case downstream users passed it as a positional argument - if output_kwargs["text_kwargs"].get("text_target") is not None and videos is not None: - raise ValueError( - "You cannot provide `text_target` as a positional argument and as a keyword argument at the same time." - "Please provide it only as a keyword argument (i.e. `text_target=...`)." - ) - if "text_target" not in output_kwargs["text_kwargs"]: - warnings.warn( - "No `text_target` kwarg was detected. The use of `text_target` as an argument without specifying it explicitely as `text_target=` will be deprecated in future versions." - ) - if videos is not None: - output_kwargs["text_kwargs"]["text_target"] = videos - - # For backwards compatibility, we reuse `backwards_compatibility_placeholder_arg` as `text_pair_target` - # in case downstream users passed it as a positional argument - if ( - output_kwargs["text_kwargs"].get("text_pair_target") is not None - and backwards_compatibility_placeholder_arg is not None - ): - raise ValueError( - "You cannot provide `text_pair_target` as a positional argument and as a keyword argument at the same time." - "Please provide it only as a keyword argument (i.e. `text_pair_target=...`)." - ) - if "text_pair_target" not in output_kwargs["text_kwargs"]: - warnings.warn( - "No `text_pair_target` kwarg was detected. The use of `text_pair_target` as an argument without specifying it explicitely as `text_pair_target=` will be deprecated in future versions." - ) - if backwards_compatibility_placeholder_arg is not None: - output_kwargs["text_kwargs"]["text_pair_target"] = backwards_compatibility_placeholder_arg - - text_pair = output_kwargs["text_kwargs"].pop("text_pair", None) - text_target = output_kwargs["text_kwargs"].pop("text_target", None) - text_pair_target = output_kwargs["text_kwargs"].pop("text_pair_target", None) - if images is not None: inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) if text is not None: - encodings = self.tokenizer( - text, - text_pair=text_pair, - text_target=text_target, - text_pair_target=text_pair_target, - **output_kwargs["text_kwargs"], - ) + encodings = self.tokenizer(text, **output_kwargs["text_kwargs"]) if text is None: return inputs diff --git a/src/transformers/models/owlv2/processing_owlv2.py b/src/transformers/models/owlv2/processing_owlv2.py index dc8fefd434762b..36af9c29c69da7 100644 --- a/src/transformers/models/owlv2/processing_owlv2.py +++ b/src/transformers/models/owlv2/processing_owlv2.py @@ -17,7 +17,6 @@ """ import sys -import warnings from typing import List, Optional, Union import numpy as np @@ -66,6 +65,8 @@ class Owlv2Processor(ProcessorMixin): attributes = ["image_processor", "tokenizer"] image_processor_class = "Owlv2ImageProcessor" tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast") + # For backward compatibility. See transformers.processing_utils.ProcessorMixin.prepare_and_validate_optional_call_args for more details. 
+ optional_call_args = ["query_images"] def __init__(self, image_processor, tokenizer, **kwargs): super().__init__(image_processor, tokenizer) @@ -74,6 +75,10 @@ def __call__( self, text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, images: Optional[ImageInput] = None, + # The following is to capture `visual_prompt` argument that may be passed as a positional argument. + # See transformers.processing_utils.ProcessorMixin.prepare_and_validate_optional_call_args for more details. + # This behavior is only needed for backward compatibility and will be removed in future versions. + *args, audio=None, videos=None, **kwargs: Unpack[Owlv2ProcessorKwargs], @@ -86,15 +91,15 @@ def __call__( of the above two methods for more information. Args: - text (`str`, `List[str]`, `List[List[str]]`): + text (`str`, `List[str]`, `List[List[str]]`, *optional*): The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, - `List[torch.Tensor]`): + `List[torch.Tensor]`, *optional*): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. Both channels-first and channels-last formats are supported. - query_images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + query_images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*): The query image to be prepared, one query image is expected per target image to be queried. Each image can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a number of channels, H and W are image height and width. @@ -111,22 +116,9 @@ def __call__( Owlv2ProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, + **self.prepare_and_validate_optional_call_args(*args), ) - if output_kwargs["images_kwargs"].get("query_images") is not None and audio is not None: - raise ValueError( - "You cannot provide `query_images` as a positional argument and as a keyword argument at the same time." - "Please provide it only as a keyword argument (i.e. `query_images=...`)." - ) - if "query_images" not in output_kwargs["images_kwargs"]: - warnings.warn( - "No `query_images` kwarg was detected. The use of `query_images` as an argument without specifying it explicitely as `query_images=` will be deprecated in future versions." 
- ) - # For backwards compatibility, we reuse `audio` as `query_images` in case - # downstream users passed it as a positional argument - if audio is not None: - output_kwargs["images_kwargs"]["query_images"] = audio - query_images = output_kwargs["images_kwargs"].pop("query_images", None) return_tensors = output_kwargs["common_kwargs"]["return_tensors"] diff --git a/src/transformers/models/owlvit/processing_owlvit.py b/src/transformers/models/owlvit/processing_owlvit.py index d6f8389b94c4b9..fe6dcc96907d40 100644 --- a/src/transformers/models/owlvit/processing_owlvit.py +++ b/src/transformers/models/owlvit/processing_owlvit.py @@ -66,6 +66,8 @@ class OwlViTProcessor(ProcessorMixin): attributes = ["image_processor", "tokenizer"] image_processor_class = "OwlViTImageProcessor" tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast") + # For backward compatibility. See transformers.processing_utils.ProcessorMixin.prepare_and_validate_optional_call_args for more details. + optional_call_args = ["query_images"] def __init__(self, image_processor=None, tokenizer=None, **kwargs): feature_extractor = None @@ -89,6 +91,10 @@ def __call__( self, text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, images: Optional[ImageInput] = None, + # The following is to capture `visual_prompt` argument that may be passed as a positional argument. + # See transformers.processing_utils.ProcessorMixin.prepare_and_validate_optional_call_args for more details. + # This behavior is only needed for backward compatibility and will be removed in future versions. + *args, audio=None, videos=None, **kwargs: Unpack[OwlViTProcessorKwargs], @@ -101,15 +107,15 @@ def __call__( of the above two methods for more information. Args: - text (`str`, `List[str]`, `List[List[str]]`): + text (`str`, `List[str]`, `List[List[str]]`, *optional*): The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, - `List[torch.Tensor]`): + `List[torch.Tensor]`, *optional*): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. Both channels-first and channels-last formats are supported. - query_images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + query_images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*): The query image to be prepared, one query image is expected per target image to be queried. Each image can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a number of channels, H and W are image height and width. @@ -127,22 +133,9 @@ def __call__( OwlViTProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, + **self.prepare_and_validate_optional_call_args(*args), ) - if output_kwargs["images_kwargs"].get("query_images") is not None and audio is not None: - raise ValueError( - "You cannot provide `query_images` as a positional argument and as a keyword argument at the same time." - "Please provide it only as a keyword argument (i.e. `query_images=...`)." 
- ) - if "query_images" not in output_kwargs["images_kwargs"]: - warnings.warn( - "No `query_images` kwarg was detected. The use of `query_images` as an argument without specifying it explicitely as `query_images=` will be deprecated in future versions." - ) - # For backwards compatibility, we reuse `audio` as `query_images` in case - # downstream users passed it as a positional argument - if audio is not None: - output_kwargs["images_kwargs"]["query_images"] = audio - query_images = output_kwargs["images_kwargs"].pop("query_images", None) return_tensors = output_kwargs["common_kwargs"]["return_tensors"] diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 09f62481956e77..98d0ad5f5bb70b 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -35,7 +35,9 @@ from .tokenization_utils_base import ( PaddingStrategy, + PreTokenizedInput, PreTrainedTokenizerBase, + TextInput, TruncationStrategy, ) from .utils import ( @@ -106,6 +108,9 @@ class TextKwargs(TypedDict, total=False): The side on which padding will be applied. """ + text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] + text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] + text_pair_target: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] add_special_tokens: Optional[bool] padding: Union[bool, str, PaddingStrategy] truncation: Union[bool, str, TruncationStrategy] @@ -317,6 +322,7 @@ class ProcessorMixin(PushToHubMixin): attributes = ["feature_extractor", "tokenizer"] optional_attributes = ["chat_template"] + optional_call_args: List[str] = [] # Names need to be attr_class for attr in attributes feature_extractor_class = None tokenizer_class = None @@ -956,6 +962,62 @@ def validate_init_kwargs(processor_config, valid_kwargs): unused_kwargs = {k: processor_config[k] for k in unused_keys} return unused_kwargs + def prepare_and_validate_optional_call_args(self, *args): + """ + Matches optional positional arguments to their corresponding names in `optional_call_args` + in the processor class in the order they are passed to the processor call. + + Note that this should only be used in the `__call__` method of the processors with special + arguments. Special arguments are arguments that aren't `text`, `images`, `audio`, nor `videos` + but also aren't passed to the tokenizer, image processor, etc. Examples of such processors are: + - `CLIPSegProcessor` + - `LayoutLMv2Processor` + - `OwlViTProcessor` + + Also note that passing by position to the processor call is now deprecated and will be disallowed + in future versions. We only have this for backward compatibility. + + Example: + Suppose that the processor class has `optional_call_args = ["arg_name_1", "arg_name_2"]`. + And we define the call method as: + ```python + def __call__( + self, + text: str, + images: Optional[ImageInput] = None, + *args, + audio=None, + videos=None, + ) + ``` + + Then, if we call the processor as: + ```python + images = [...] + processor("What is common in these images?", images, "arg_value_1", "arg_value_2") + ``` + + Then, this method will return: + ```python + { + "arg_name_1": "arg_value_1", + "arg_name_2": "arg_value_2", + } + ``` + which we could then pass as kwargs to `self._merge_kwargs` + """ + if len(args): + warnings.warn( + "Passing positional arguments to the processor call is now deprecated and will be disallowed in future versions. 
" + "Please pass all arguments as keyword arguments." + ) + if len(args) > len(self.optional_call_args): + raise ValueError( + f"Expected *at most* {len(self.optional_call_args)} optional positional arguments in processor call but received {len(args)}." + "Passing positional arguments to the processor call is not recommended" + ) + return {arg_name: arg_value for arg_value, arg_name in zip(args, self.optional_call_args)} + def apply_chat_template( self, conversation: Union[List[Dict[str, str]]], diff --git a/tests/models/clipseg/test_processor_clipseg.py b/tests/models/clipseg/test_processor_clipseg.py index bfc7d59c1b0ff3..a535219a23fb0c 100644 --- a/tests/models/clipseg/test_processor_clipseg.py +++ b/tests/models/clipseg/test_processor_clipseg.py @@ -193,6 +193,23 @@ def test_processor_visual_prompt(self): with pytest.raises(ValueError): processor() + def test_processor_visual_prompt_positional(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor) + + image_input = self.prepare_image_inputs() + visual_prompt_input = self.prepare_image_inputs() + + inputs = processor(None, image_input, visual_prompt_input) + + self.assertListEqual(list(inputs.keys()), ["pixel_values", "conditional_pixel_values"]) + + # test if it raises when no input is passed + with pytest.raises(ValueError): + processor() + def test_tokenizer_decode(self): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer() diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index 577341fe531b6e..bad0eb8cd6b72b 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -26,6 +26,7 @@ import unittest import numpy as np +from huggingface_hub import hf_hub_download from transformers import CLIPTokenizerFast, ProcessorMixin from transformers.models.auto.processing_auto import processor_class_from_name @@ -48,9 +49,10 @@ @require_vision @require_torch class ProcessorTesterMixin: - image_data_arg_name = "pixel_values" - text_data_arg_name = "input_ids" processor_class = None + text_data_arg_name = "input_ids" + images_data_arg_name = "pixel_values" + videos_data_arg_name = "pixel_values_videos" def prepare_processor_dict(self): return {} @@ -90,6 +92,13 @@ def prepare_image_inputs(self): image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] return image_inputs + @require_vision + def prepare_video_inputs(self): + video_file = hf_hub_download( + repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset" + ) + return [np.load(video_file)] + def test_processor_to_json_string(self): processor = self.get_processor() obj = json.loads(processor.to_json_string()) @@ -129,43 +138,69 @@ def skip_processor_without_typed_kwargs(self, processor): def test_tokenizer_defaults_preserved_by_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") + processor_components = self.prepare_components() + processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor = self.processor_class(**processor_components) 
self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input, return_tensors="pt") - self.assertEqual(len(inputs[self.text_data_arg_name][0]), 117) + self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 117) @require_torch @require_vision def test_image_processor_defaults_preserved_by_image_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", size=(234, 234)) + processor_components = self.prepare_components() + processor_components["image_processor"] = self.get_component( + "image_processor", size=(234, 234), crop_size=(234, 234) + ) + processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") + + processor = self.processor_class(**processor_components) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input, return_tensors="pt") + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 234) + + @require_torch + @require_vision + def test_video_processor_defaults_preserved_by_kwargs(self): + if "video_processor" not in self.processor_class.attributes: + self.skipTest(f"video_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor", size=(234, 234), crop_size=(234, 234)) + video_processor = self.get_component("video_processor", size=(234, 234), crop_size=(234, 234)) tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor = self.processor_class( + tokenizer=tokenizer, + image_processor=image_processor, + video_processor=video_processor, + ) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() + video_input = self.prepare_video_inputs() - inputs = processor(text=input_str, images=image_input) - self.assertEqual(len(inputs[self.image_data_arg_name][0][0]), 234) + inputs = processor(text=input_str, images=image_input, videos=video_input, return_tensors="pt") + self.assertEqual(inputs[self.videos_data_arg_name].shape[-1], 234) @require_vision @require_torch def test_kwargs_overrides_default_tokenizer_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer", padding="longest") + processor_components = self.prepare_components() + processor_components["tokenizer"] = self.get_component("tokenizer", padding="longest") - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() @@ -173,34 +208,35 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): inputs = processor( text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length" ) - self.assertEqual(len(inputs[self.text_data_arg_name][0]), 112) + 
self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 112) @require_torch @require_vision def test_kwargs_overrides_default_image_processor_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", size=(234, 234)) - tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") + processor_components = self.prepare_components() + processor_components["image_processor"] = self.get_component("image_processor", size=(234, 234)) + processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input, size=[224, 224]) - self.assertEqual(len(inputs[self.image_data_arg_name][0][0]), 224) + inputs = processor( + text=input_str, images=image_input, size=[224, 224], crop_size=(224, 224), return_tensors="pt" + ) + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 224) @require_torch @require_vision def test_unstructured_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" @@ -210,22 +246,21 @@ def test_unstructured_kwargs(self): images=image_input, return_tensors="pt", size={"height": 214, "width": 214}, + crop_size={"height": 214, "width": 214}, padding="max_length", max_length=76, ) - self.assertEqual(inputs[self.image_data_arg_name].shape[2], 214) - self.assertEqual(len(inputs[self.text_data_arg_name][0]), 76) + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 214) + self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 76) @require_torch @require_vision def test_unstructured_kwargs_batched(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = ["lower newer", "upper older longer string"] @@ -235,23 +270,21 @@ def test_unstructured_kwargs_batched(self): images=image_input, return_tensors="pt", size={"height": 214, "width": 214}, + crop_size={"height": 214, "width": 214}, padding="longest", max_length=76, ) - self.assertEqual(inputs[self.image_data_arg_name].shape[2], 214) - - self.assertEqual(len(inputs[self.text_data_arg_name][0]), 6) + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 214) + self.assertEqual(len(inputs[self.text_data_arg_name][0]), len(inputs[self.text_data_arg_name][1])) 
@require_torch @require_vision def test_doubly_passed_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = ["lower newer"] @@ -262,6 +295,8 @@ def test_doubly_passed_kwargs(self): images=image_input, images_kwargs={"size": {"height": 222, "width": 222}}, size={"height": 214, "width": 214}, + crop_size={"height": 214, "width": 214}, + return_tensors="pt", ) @require_torch @@ -269,10 +304,8 @@ def test_doubly_passed_kwargs(self): def test_structured_kwargs_nested(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" @@ -281,42 +314,44 @@ def test_structured_kwargs_nested(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"size": {"height": 214, "width": 214}}, + "images_kwargs": { + "size": {"height": 214, "width": 214}, + "crop_size": {"height": 214, "width": 214}, + }, "text_kwargs": {"padding": "max_length", "max_length": 76}, } inputs = processor(text=input_str, images=image_input, **all_kwargs) self.skip_processor_without_typed_kwargs(processor) - self.assertEqual(inputs[self.image_data_arg_name].shape[2], 214) - - self.assertEqual(len(inputs[self.text_data_arg_name][0]), 76) + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 214) + self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 76) @require_torch @require_vision def test_structured_kwargs_nested_from_dict(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" image_input = self.prepare_image_inputs() # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"size": {"height": 214, "width": 214}}, + "images_kwargs": { + "size": {"height": 214, "width": 214}, + "crop_size": {"height": 214, "width": 214}, + }, "text_kwargs": {"padding": "max_length", "max_length": 76}, } inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.assertEqual(inputs[self.image_data_arg_name].shape[2], 214) - - self.assertEqual(len(inputs[self.text_data_arg_name][0]), 76) + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 214) + 
self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 76) class MyProcessor(ProcessorMixin): From 8c36cfb46f51259313bb82941652827cbe20852a Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Mon, 19 Aug 2024 15:51:01 +0800 Subject: [PATCH 05/10] address @zucchini's comments --- .../models/clipseg/processing_clipseg.py | 7 +++-- .../models/nougat/processing_nougat.py | 30 +++++++++++++++++-- .../models/owlv2/processing_owlv2.py | 11 +++---- .../models/owlvit/processing_owlvit.py | 11 +++---- src/transformers/processing_utils.py | 6 ++-- 5 files changed, 48 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/clipseg/processing_clipseg.py b/src/transformers/models/clipseg/processing_clipseg.py index 7219fc7d4831ea..f99a8231fe343d 100644 --- a/src/transformers/models/clipseg/processing_clipseg.py +++ b/src/transformers/models/clipseg/processing_clipseg.py @@ -21,8 +21,9 @@ from typing import List, Optional, Union from ...image_utils import ImageInput +from ...feature_extraction_utils import BatchFeature from ...processing_utils import ProcessingKwargs, ProcessorMixin -from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput +from ...tokenization_utils_base import PreTokenizedInput, TextInput if sys.version_info >= (3, 11): @@ -106,7 +107,7 @@ def __call__( (C, H, W), where C is a number of channels, H and W are image height and width. Returns: - [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when @@ -156,7 +157,7 @@ def __call__( } return encoding else: - return BatchEncoding( + return BatchFeature( data=dict(**image_features), tensor_type=output_kwargs["common_kwargs"].get("return_tensors") ) diff --git a/src/transformers/models/nougat/processing_nougat.py b/src/transformers/models/nougat/processing_nougat.py index cbb43449dbe355..138085e8b73ce9 100644 --- a/src/transformers/models/nougat/processing_nougat.py +++ b/src/transformers/models/nougat/processing_nougat.py @@ -71,8 +71,8 @@ class NougatProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = "AutoTokenizer" + image_processor_class = "NougatImageProcessor" + tokenizer_class = "NougatTokenizerFast" def __init__(self, image_processor, tokenizer): super().__init__(image_processor, tokenizer) @@ -86,6 +86,32 @@ def __call__( videos=None, **kwargs: Unpack[NougatProcessorKwargs], ): + """ + Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and + `kwargs` arguments to NougatTokenizerFast's [`~NougatTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to + NougatImageProcessor's [`~NougatImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring + of the above two methods for more information. + + Args: + text (`str`, `List[str]`, `List[List[str]]`, *optional*): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, + `List[torch.Tensor]`, *optional*): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + - **labels** -- List of label token ids to be fed to a model. Returned when both `text` and `images` are not `None`. + """ if images is None and text is None: raise ValueError("You need to specify either an `images` or `text` input to process.") diff --git a/src/transformers/models/owlv2/processing_owlv2.py b/src/transformers/models/owlv2/processing_owlv2.py index 36af9c29c69da7..ba8be3c115e14a 100644 --- a/src/transformers/models/owlv2/processing_owlv2.py +++ b/src/transformers/models/owlv2/processing_owlv2.py @@ -22,8 +22,9 @@ import numpy as np from ...image_utils import ImageInput +from ...feature_extraction_utils import BatchFeature from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin -from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput +from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import is_flax_available, is_tf_available, is_torch_available @@ -105,7 +106,7 @@ def __call__( should be of shape (C, H, W), where C is a number of channels, H and W are image height and width. Returns: - [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. 
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not @@ -172,12 +173,12 @@ def __call__( else: raise ValueError("Target return tensor type could not be returned") - encoding = BatchEncoding() + encoding = BatchFeature() encoding["input_ids"] = input_ids encoding["attention_mask"] = attention_mask if query_images is not None: - encoding = BatchEncoding() + encoding = BatchFeature() query_pixel_values = self.image_processor(query_images, **output_kwargs["images_kwargs"]).pixel_values encoding["query_pixel_values"] = query_pixel_values @@ -193,7 +194,7 @@ def __call__( elif text is not None or query_images is not None: return encoding else: - return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) + return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_object_detection with OWLViT->OWLv2 def post_process_object_detection(self, *args, **kwargs): diff --git a/src/transformers/models/owlvit/processing_owlvit.py b/src/transformers/models/owlvit/processing_owlvit.py index fe6dcc96907d40..5c676ce8ea0c4f 100644 --- a/src/transformers/models/owlvit/processing_owlvit.py +++ b/src/transformers/models/owlvit/processing_owlvit.py @@ -23,8 +23,9 @@ import numpy as np from ...image_utils import ImageInput +from ...feature_extraction_utils import BatchFeature from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin -from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput +from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import is_flax_available, is_tf_available, is_torch_available @@ -121,7 +122,7 @@ def __call__( should be of shape (C, H, W), where C is a number of channels, H and W are image height and width. Returns: - [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. 
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not `None`). @@ -189,12 +190,12 @@ def __call__( else: raise ValueError("Target return tensor type could not be returned") - encoding = BatchEncoding() + encoding = BatchFeature() encoding["input_ids"] = input_ids encoding["attention_mask"] = attention_mask if query_images is not None: - encoding = BatchEncoding() + encoding = BatchFeature() query_pixel_values = self.image_processor(query_images, **output_kwargs["images_kwargs"]).pixel_values encoding["query_pixel_values"] = query_pixel_values @@ -210,7 +211,7 @@ def __call__( elif text is not None or query_images is not None: return encoding else: - return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) + return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) def post_process(self, *args, **kwargs): """ diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 98d0ad5f5bb70b..f13a45ae66bd9d 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -1013,8 +1013,10 @@ def __call__( ) if len(args) > len(self.optional_call_args): raise ValueError( - f"Expected *at most* {len(self.optional_call_args)} optional positional arguments in processor call but received {len(args)}." - "Passing positional arguments to the processor call is not recommended" + f"Expected *at most* {len(self.optional_call_args)} optional positional arguments in processor call " + f"which will be matched with {' '.join(self.optional_call_args)} in the order they are passed. " + f"However, got {len(args)} positional arguments instead. " + "Please pass all arguments as keyword arguments instead (e.g. `processor(arg_name_1=..., arg_name_2=...)`)." ) return {arg_name: arg_value for arg_value, arg_name in zip(args, self.optional_call_args)} From 3a2f7ef0a56a41910cf4a480a12a83fdf15ee61a Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Mon, 19 Aug 2024 16:00:55 +0800 Subject: [PATCH 06/10] fix docs --- src/transformers/models/clipseg/processing_clipseg.py | 11 ++++++----- src/transformers/models/nougat/processing_nougat.py | 8 ++++---- src/transformers/models/owlv2/processing_owlv2.py | 3 ++- src/transformers/models/owlvit/processing_owlvit.py | 3 ++- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/clipseg/processing_clipseg.py b/src/transformers/models/clipseg/processing_clipseg.py index f99a8231fe343d..3c3995e46ac7e3 100644 --- a/src/transformers/models/clipseg/processing_clipseg.py +++ b/src/transformers/models/clipseg/processing_clipseg.py @@ -20,8 +20,8 @@ import warnings from typing import List, Optional, Union -from ...image_utils import ImageInput from ...feature_extraction_utils import BatchFeature +from ...image_utils import ImageInput from ...processing_utils import ProcessingKwargs, ProcessorMixin from ...tokenization_utils_base import PreTokenizedInput, TextInput @@ -109,11 +109,12 @@ def __call__( Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: - - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None` and `visual_prompt` is `None`.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not - `None`). + `None`) and `visual_prompt` is `None`. - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + - **conditional_pixel_values** -- Conditional pixel values to be fed to a model. Returned when `visual_prompt` is not `None`. """ output_kwargs = self._merge_kwargs( @@ -145,7 +146,7 @@ def __call__( "pixel_values": image_features.pixel_values, "conditional_pixel_values": prompt_features.pixel_values, } - return encoding + return BatchFeature(data=encoding, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) elif text is not None and images is not None: encoding["pixel_values"] = image_features.pixel_values return encoding @@ -155,7 +156,7 @@ def __call__( encoding = { "conditional_pixel_values": prompt_features.pixel_values, } - return encoding + return BatchFeature(data=encoding, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) else: return BatchFeature( data=dict(**image_features), tensor_type=output_kwargs["common_kwargs"].get("return_tensors") diff --git a/src/transformers/models/nougat/processing_nougat.py b/src/transformers/models/nougat/processing_nougat.py index 138085e8b73ce9..21baf3e3cc1a22 100644 --- a/src/transformers/models/nougat/processing_nougat.py +++ b/src/transformers/models/nougat/processing_nougat.py @@ -94,14 +94,14 @@ def __call__( of the above two methods for more information. Args: - text (`str`, `List[str]`, `List[List[str]]`, *optional*): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. Both channels-first and channels-last formats are supported. + text (`str`, `List[str]`, `List[List[str]]`, *optional*): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: diff --git a/src/transformers/models/owlv2/processing_owlv2.py b/src/transformers/models/owlv2/processing_owlv2.py index ba8be3c115e14a..1844c7237f98cf 100644 --- a/src/transformers/models/owlv2/processing_owlv2.py +++ b/src/transformers/models/owlv2/processing_owlv2.py @@ -21,8 +21,8 @@ import numpy as np -from ...image_utils import ImageInput from ...feature_extraction_utils import BatchFeature +from ...image_utils import ImageInput from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import is_flax_available, is_tf_available, is_torch_available @@ -112,6 +112,7 @@ def __call__( `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not `None`). 
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + - **query_pixel_values** -- Pixel values of the query images to be fed to a model. Returned when `query_images` is not `None`. """ output_kwargs = self._merge_kwargs( Owlv2ProcessorKwargs, diff --git a/src/transformers/models/owlvit/processing_owlvit.py b/src/transformers/models/owlvit/processing_owlvit.py index 5c676ce8ea0c4f..03c530744c7dfe 100644 --- a/src/transformers/models/owlvit/processing_owlvit.py +++ b/src/transformers/models/owlvit/processing_owlvit.py @@ -22,8 +22,8 @@ import numpy as np -from ...image_utils import ImageInput from ...feature_extraction_utils import BatchFeature +from ...image_utils import ImageInput from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import is_flax_available, is_tf_available, is_torch_available @@ -128,6 +128,7 @@ def __call__( `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not `None`). - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + - **query_pixel_values** -- Pixel values of the query images to be fed to a model. Returned when `query_images` is not `None`. """ output_kwargs = self._merge_kwargs( From a280b3ac41f1adcb6bd178875c622f2496d676bb Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Mon, 19 Aug 2024 16:07:26 +0800 Subject: [PATCH 07/10] rm video testing --- tests/test_processing_common.py | 31 ------------------------------- 1 file changed, 31 deletions(-) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index bad0eb8cd6b72b..05cc96d3ce76dc 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -26,7 +26,6 @@ import unittest import numpy as np -from huggingface_hub import hf_hub_download from transformers import CLIPTokenizerFast, ProcessorMixin from transformers.models.auto.processing_auto import processor_class_from_name @@ -92,13 +91,6 @@ def prepare_image_inputs(self): image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] return image_inputs - @require_vision - def prepare_video_inputs(self): - video_file = hf_hub_download( - repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset" - ) - return [np.load(video_file)] - def test_processor_to_json_string(self): processor = self.get_processor() obj = json.loads(processor.to_json_string()) @@ -169,29 +161,6 @@ def test_image_processor_defaults_preserved_by_image_kwargs(self): inputs = processor(text=input_str, images=image_input, return_tensors="pt") self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 234) - @require_torch - @require_vision - def test_video_processor_defaults_preserved_by_kwargs(self): - if "video_processor" not in self.processor_class.attributes: - self.skipTest(f"video_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", size=(234, 234), crop_size=(234, 234)) - video_processor = self.get_component("video_processor", size=(234, 234), crop_size=(234, 234)) - tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") - - processor = self.processor_class( - tokenizer=tokenizer, - image_processor=image_processor, - video_processor=video_processor, - ) - self.skip_processor_without_typed_kwargs(processor) - - input_str = "lower newer" - image_input = 
self.prepare_image_inputs() - video_input = self.prepare_video_inputs() - - inputs = processor(text=input_str, images=image_input, videos=video_input, return_tensors="pt") - self.assertEqual(inputs[self.videos_data_arg_name].shape[-1], 234) - @require_vision @require_torch def test_kwargs_overrides_default_tokenizer_kwargs(self): From ca925cc872a147ca501e15f53d8174e3e57a824d Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Tue, 20 Aug 2024 19:34:07 +0800 Subject: [PATCH 08/10] make processor call implementations simpler too --- .../models/clipseg/processing_clipseg.py | 33 +++++-------------- .../models/nougat/processing_nougat.py | 27 ++++++++------- .../models/owlv2/processing_owlv2.py | 31 +++++++++-------- .../models/owlvit/processing_owlvit.py | 29 ++++++++-------- 4 files changed, 49 insertions(+), 71 deletions(-) diff --git a/src/transformers/models/clipseg/processing_clipseg.py b/src/transformers/models/clipseg/processing_clipseg.py index 3c3995e46ac7e3..55326d6147243a 100644 --- a/src/transformers/models/clipseg/processing_clipseg.py +++ b/src/transformers/models/clipseg/processing_clipseg.py @@ -85,7 +85,7 @@ def __call__( audio=None, videos=None, **kwargs: Unpack[CLIPSegProcessorKwargs], - ): + ) -> BatchFeature: """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode @@ -132,35 +132,18 @@ def __call__( if text is not None and visual_prompt is not None: raise ValueError("You have to specify exactly one type of prompt. Either text or visual prompt.") + data = {} if text is not None: - encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) - + text_features = self.tokenizer(text, **output_kwargs["text_kwargs"]) + data.update(text_features) if visual_prompt is not None: - prompt_features = self.image_processor(visual_prompt, **output_kwargs["images_kwargs"]) - + prompt_image_features = self.image_processor(visual_prompt, **output_kwargs["images_kwargs"]) + data["conditional_pixel_values"] = prompt_image_features.pixel_values if images is not None: image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) + data["pixel_values"] = image_features.pixel_values - if visual_prompt is not None and images is not None: - encoding = { - "pixel_values": image_features.pixel_values, - "conditional_pixel_values": prompt_features.pixel_values, - } - return BatchFeature(data=encoding, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) - elif text is not None and images is not None: - encoding["pixel_values"] = image_features.pixel_values - return encoding - elif text is not None: - return encoding - elif visual_prompt is not None: - encoding = { - "conditional_pixel_values": prompt_features.pixel_values, - } - return BatchFeature(data=encoding, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) - else: - return BatchFeature( - data=dict(**image_features), tensor_type=output_kwargs["common_kwargs"].get("return_tensors") - ) + return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/nougat/processing_nougat.py b/src/transformers/models/nougat/processing_nougat.py index 21baf3e3cc1a22..638659ff49f6af 100644 --- a/src/transformers/models/nougat/processing_nougat.py +++ b/src/transformers/models/nougat/processing_nougat.py @@ -19,6 +19,7 @@ 
import sys from typing import List, Optional, Union +from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin from ...tokenization_utils_base import PreTokenizedInput, TextInput @@ -85,7 +86,7 @@ def __call__( audio=None, videos=None, **kwargs: Unpack[NougatProcessorKwargs], - ): + ) -> BatchFeature: """ Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and `kwargs` arguments to NougatTokenizerFast's [`~NougatTokenizerFast.__call__`] if `text` is not `None` to encode @@ -105,12 +106,12 @@ def __call__( Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: - - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **labels** -- List of token ids to be fed to a model. Returned when both `text` and `images` are not `None`. + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None` and `images` is `None`. - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not `None`). - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. - - **labels** -- List of label token ids to be fed to a model. Returned when both `text` and `images` are not `None`. """ if images is None and text is None: raise ValueError("You need to specify either an `images` or `text` input to process.") @@ -123,18 +124,16 @@ def __call__( # Temporary fix for "paddding_side" in init_kwargs _ = output_kwargs["text_kwargs"].pop("padding_side", None) - if images is not None: - inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) + data = {} if text is not None: - encodings = self.tokenizer(text, **output_kwargs["text_kwargs"]) - - if text is None: - return inputs - elif images is None: - return encodings - else: - inputs["labels"] = encodings["input_ids"] - return inputs + text_features = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) + data.update(text_features) + if images is not None: + image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) + data.update(image_features) + if "input_ids" in data: + data["labels"] = data.pop("input_ids") + return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/owlv2/processing_owlv2.py b/src/transformers/models/owlv2/processing_owlv2.py index 1844c7237f98cf..5189bea0e6b41c 100644 --- a/src/transformers/models/owlv2/processing_owlv2.py +++ b/src/transformers/models/owlv2/processing_owlv2.py @@ -17,6 +17,7 @@ """ import sys +import warnings from typing import List, Optional, Union import numpy as np @@ -83,7 +84,7 @@ def __call__( audio=None, videos=None, **kwargs: Unpack[Owlv2ProcessorKwargs], - ): + ) -> BatchFeature: """ Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode: @@ -114,6 +115,7 @@ def __call__( - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. - **query_pixel_values** -- Pixel values of the query images to be fed to a model. 
Returned when `query_images` is not `None`. """ + output_kwargs = self._merge_kwargs( Owlv2ProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, @@ -128,6 +130,12 @@ def __call__( raise ValueError( "You have to specify at least one text or query image or image. All three cannot be none." ) + if text is not None and query_images is not None: + warnings.warn( + "Query images will override the text prompt. In the future, this will raise an error.", FutureWarning + ) + + data = {} if text is not None: if isinstance(text, str) or (isinstance(text, List) and not isinstance(text[0], List)): @@ -174,28 +182,19 @@ def __call__( else: raise ValueError("Target return tensor type could not be returned") - encoding = BatchFeature() - encoding["input_ids"] = input_ids - encoding["attention_mask"] = attention_mask + data["input_ids"] = input_ids + data["attention_mask"] = attention_mask if query_images is not None: - encoding = BatchFeature() query_pixel_values = self.image_processor(query_images, **output_kwargs["images_kwargs"]).pixel_values - encoding["query_pixel_values"] = query_pixel_values + # Query images always override the text prompt + data = {"query_pixel_values": query_pixel_values} if images is not None: image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) + data["pixel_values"] = image_features.pixel_values - if text is not None and images is not None: - encoding["pixel_values"] = image_features.pixel_values - return encoding - elif query_images is not None and images is not None: - encoding["pixel_values"] = image_features.pixel_values - return encoding - elif text is not None or query_images is not None: - return encoding - else: - return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) + return BatchFeature(data=data, tensor_type=return_tensors) # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_object_detection with OWLViT->OWLv2 def post_process_object_detection(self, *args, **kwargs): diff --git a/src/transformers/models/owlvit/processing_owlvit.py b/src/transformers/models/owlvit/processing_owlvit.py index 03c530744c7dfe..5860f1043625f6 100644 --- a/src/transformers/models/owlvit/processing_owlvit.py +++ b/src/transformers/models/owlvit/processing_owlvit.py @@ -99,7 +99,7 @@ def __call__( audio=None, videos=None, **kwargs: Unpack[OwlViTProcessorKwargs], - ): + ) -> BatchFeature: """ Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode: @@ -145,6 +145,12 @@ def __call__( raise ValueError( "You have to specify at least one text or query image or image. All three cannot be none." ) + if text is not None and query_images is not None: + warnings.warn( + "Query images will override the text prompt. 
In the future, this will raise an error.", FutureWarning + ) + + data = {} if text is not None: if isinstance(text, str) or (isinstance(text, List) and not isinstance(text[0], List)): @@ -191,28 +197,19 @@ def __call__( else: raise ValueError("Target return tensor type could not be returned") - encoding = BatchFeature() - encoding["input_ids"] = input_ids - encoding["attention_mask"] = attention_mask + data["input_ids"] = input_ids + data["attention_mask"] = attention_mask if query_images is not None: - encoding = BatchFeature() query_pixel_values = self.image_processor(query_images, **output_kwargs["images_kwargs"]).pixel_values - encoding["query_pixel_values"] = query_pixel_values + # Query images always override the text prompt + data = {"query_pixel_values": query_pixel_values} if images is not None: image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) + data["pixel_values"] = image_features.pixel_values - if text is not None and images is not None: - encoding["pixel_values"] = image_features.pixel_values - return encoding - elif query_images is not None and images is not None: - encoding["pixel_values"] = image_features.pixel_values - return encoding - elif text is not None or query_images is not None: - return encoding - else: - return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) + return BatchFeature(data=data, tensor_type=return_tensors) def post_process(self, *args, **kwargs): """ From 71a7ee109345438a27344e4577a38be0bddb9e34 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Wed, 21 Aug 2024 00:47:34 +0800 Subject: [PATCH 09/10] fix test for clipseg and add more tests for owl models --- tests/models/clipseg/test_processor_clipseg.py | 2 +- tests/models/owlv2/test_processor_owlv2.py | 17 +++++++++++++++++ tests/models/owlvit/test_processor_owlvit.py | 15 +++++++++++++++ 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/tests/models/clipseg/test_processor_clipseg.py b/tests/models/clipseg/test_processor_clipseg.py index a535219a23fb0c..7ac937bfa1ff99 100644 --- a/tests/models/clipseg/test_processor_clipseg.py +++ b/tests/models/clipseg/test_processor_clipseg.py @@ -187,7 +187,7 @@ def test_processor_visual_prompt(self): inputs = processor(images=image_input, visual_prompt=visual_prompt_input) - self.assertListEqual(list(inputs.keys()), ["pixel_values", "conditional_pixel_values"]) + self.assertListEqual(list(inputs.keys()), ["conditional_pixel_values", "pixel_values"]) # test if it raises when no input is passed with pytest.raises(ValueError): diff --git a/tests/models/owlv2/test_processor_owlv2.py b/tests/models/owlv2/test_processor_owlv2.py index b8f8b5d26cfe41..eadbb7c074fee9 100644 --- a/tests/models/owlv2/test_processor_owlv2.py +++ b/tests/models/owlv2/test_processor_owlv2.py @@ -1,6 +1,8 @@ import tempfile import unittest +import pytest + from transformers import Owlv2Processor from transformers.testing_utils import require_scipy @@ -16,3 +18,18 @@ def setUp(self): self.tmpdirname = tempfile.mkdtemp() processor = self.processor_class.from_pretrained(self.from_pretrained_id) processor.save_pretrained(self.tmpdirname) + + def test_processor_query_images_positional(self): + processor_components = self.prepare_components() + processor = Owlv2Processor(**processor_components) + + image_input = self.prepare_image_inputs() + query_images = self.prepare_image_inputs() + + inputs = processor(None, image_input, query_images) + + self.assertListEqual(list(inputs.keys()), ["query_pixel_values", "pixel_values"]) + + # 
test if it raises when no input is passed + with pytest.raises(ValueError): + processor() diff --git a/tests/models/owlvit/test_processor_owlvit.py b/tests/models/owlvit/test_processor_owlvit.py index 3fadfac0046002..698882233b875d 100644 --- a/tests/models/owlvit/test_processor_owlvit.py +++ b/tests/models/owlvit/test_processor_owlvit.py @@ -258,3 +258,18 @@ def test_tokenizer_decode(self): decoded_tok = tokenizer.batch_decode(predicted_ids) self.assertListEqual(decoded_tok, decoded_processor) + + def test_processor_query_images_positional(self): + processor_components = self.prepare_components() + processor = OwlViTProcessor(**processor_components) + + image_input = self.prepare_image_inputs() + query_images = self.prepare_image_inputs() + + inputs = processor(None, image_input, query_images) + + self.assertListEqual(list(inputs.keys()), ["query_pixel_values", "pixel_values"]) + + # test if it raises when no input is passed + with pytest.raises(ValueError): + processor() From 274b61573a16be1cfab92d11d38448c29f914da1 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Wed, 21 Aug 2024 01:04:46 +0800 Subject: [PATCH 10/10] fix test for clipseg --- tests/models/clipseg/test_processor_clipseg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/clipseg/test_processor_clipseg.py b/tests/models/clipseg/test_processor_clipseg.py index 7ac937bfa1ff99..2359201f7c9793 100644 --- a/tests/models/clipseg/test_processor_clipseg.py +++ b/tests/models/clipseg/test_processor_clipseg.py @@ -204,7 +204,7 @@ def test_processor_visual_prompt_positional(self): inputs = processor(None, image_input, visual_prompt_input) - self.assertListEqual(list(inputs.keys()), ["pixel_values", "conditional_pixel_values"]) + self.assertListEqual(list(inputs.keys()), ["conditional_pixel_values", "pixel_values"]) # test if it raises when no input is passed with pytest.raises(ValueError):
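
For reference outside the patches themselves, a minimal usage sketch of the backward-compatibility path this series converges on (assuming the `google/owlvit-base-patch32` checkpoint and the `optional_call_args = ["query_images"]` declaration shown in the hunks above):

```python
import numpy as np
from PIL import Image
from transformers import OwlViTProcessor

# Assumed checkpoint; any OWL-ViT checkpoint served by OwlViTProcessor should behave the same way.
processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")

image = Image.fromarray(np.zeros((64, 64, 3), dtype=np.uint8))
query_image = Image.fromarray(np.zeros((64, 64, 3), dtype=np.uint8))

# Deprecated path exercised by the new tests: `query_images` passed positionally is matched to
# `optional_call_args[0]` by `prepare_and_validate_optional_call_args`, with a deprecation warning.
inputs = processor(None, [image], [query_image], return_tensors="pt")
print(sorted(inputs.keys()))  # expected: ['pixel_values', 'query_pixel_values']

# Preferred, forward-compatible call: everything passed as keyword arguments.
inputs = processor(images=[image], query_images=[query_image], return_tensors="pt")
```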