From 2f4163afdf72f2e57e36c28c6c1f3d77056c86d0 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Fri, 16 Aug 2024 15:38:35 +0800 Subject: [PATCH 01/25] uniformize kwargs of Chameleon --- .../models/chameleon/processing_chameleon.py | 81 ++++++++++--------- 1 file changed, 41 insertions(+), 40 deletions(-) diff --git a/src/transformers/models/chameleon/processing_chameleon.py b/src/transformers/models/chameleon/processing_chameleon.py index 1480808336d14e..2cac2d4bcb986a 100644 --- a/src/transformers/models/chameleon/processing_chameleon.py +++ b/src/transformers/models/chameleon/processing_chameleon.py @@ -16,13 +16,36 @@ Processor class for Chameleon. """ +import sys from typing import List, Optional, Union from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy -from ...utils import TensorType +from ...processing_utils import ProcessingKwargs, ProcessorMixin, TextKwargs +from ...tokenization_utils_base import PreTokenizedInput, TextInput + +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + + +class ChameleonTextKwargs(TextKwargs, total=False): + return_for_text_completion: bool + + +class ChameleonProcessorKwargs(ProcessingKwargs, total=False): + text_kwargs: ChameleonTextKwargs + _defaults = { + "text_kwargs": { + "padding": False, + "stride": 0, + "return_for_text_completion": False, + }, + "common_kwargs": { + "return_tensors": "pt", + }, + } class ChameleonProcessor(ProcessorMixin): @@ -57,13 +80,9 @@ def __init__(self, image_processor, tokenizer, image_seq_length: int = 1024, ima def __call__( self, - text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, - images: ImageInput = None, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = None, - max_length: int = None, - return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH, - return_for_text_completion: bool = False, + text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + images: Optional[ImageInput] = None, + **kwargs: Unpack[ChameleonProcessorKwargs], ) -> BatchFeature: """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` @@ -80,26 +99,6 @@ def __call__( images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. Both channels-first and channels-last formats are supported. - padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): - Select a strategy to pad the returned sequences (according to the model's padding side and padding - index) among: - - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single - sequence if provided). - - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum - acceptable input length for the model if that argument is not provided. - - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different - lengths). - max_length (`int`, *optional*): - Maximum length of the returned list and optionally padding length (see above). 
- truncation (`bool`, *optional*): - Activates truncation to cut input sequences longer than `max_length` to `max_length`. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - - `'tf'`: Return TensorFlow `tf.constant` objects. - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - - `'jax'`: Return JAX `jnp.ndarray` objects. Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: @@ -114,6 +113,15 @@ def __call__( text = [text] elif not isinstance(text, list) and not isinstance(text[0], str): raise TypeError("Invalid input text. Please provide a string, or a list of strings") + if text is None and images is None: + raise ValueError("You must provide either text or images") + + output_kwargs = self._merge_kwargs( + ChameleonProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + return_for_text_completion = output_kwargs["text_kwargs"].pop("return_for_text_completion", False) # Replace the image token with the expanded image token sequence prompt_strings = [] @@ -124,19 +132,12 @@ def __call__( sample += self.tokenizer.sep_token # special Chameleon treatment to add sep for chat mode prompt_strings.append(sample) - data = self.tokenizer( - prompt_strings, - return_tensors=return_tensors, - padding=padding, - truncation=truncation, - max_length=max_length, - ) + data = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) if images is not None: - pixel_values = self.image_processor(images, return_tensors=return_tensors)["pixel_values"] - data["pixel_values"] = pixel_values + data["pixel_values"] = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"] - return BatchFeature(data=data, tensor_type=return_tensors) + return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"]["return_tensors"]) # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama def batch_decode(self, *args, **kwargs): From 258814458d5fdbf3282a86b46c4a794ca9ccb8b4 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Fri, 16 Aug 2024 15:41:52 +0800 Subject: [PATCH 02/25] fix linter nit --- src/transformers/models/chameleon/processing_chameleon.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/chameleon/processing_chameleon.py b/src/transformers/models/chameleon/processing_chameleon.py index 2cac2d4bcb986a..a039101f56d71e 100644 --- a/src/transformers/models/chameleon/processing_chameleon.py +++ b/src/transformers/models/chameleon/processing_chameleon.py @@ -24,6 +24,7 @@ from ...processing_utils import ProcessingKwargs, ProcessorMixin, TextKwargs from ...tokenization_utils_base import PreTokenizedInput, TextInput + if sys.version_info >= (3, 11): from typing import Unpack else: From 6454130ad1b278beaa682c5311e30020c013013c Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Fri, 16 Aug 2024 17:38:16 +0800 Subject: [PATCH 03/25] rm stride default --- src/transformers/models/chameleon/processing_chameleon.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/chameleon/processing_chameleon.py b/src/transformers/models/chameleon/processing_chameleon.py index a039101f56d71e..d999267ef1fa9c 100644 --- a/src/transformers/models/chameleon/processing_chameleon.py +++ b/src/transformers/models/chameleon/processing_chameleon.py @@ -40,7 +40,6 @@ class ChameleonProcessorKwargs(ProcessingKwargs, total=False): 
_defaults = { "text_kwargs": { "padding": False, - "stride": 0, "return_for_text_completion": False, }, "common_kwargs": { From 9949e722b40261e7d088070e2dec7034220807be Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Fri, 16 Aug 2024 17:43:30 +0800 Subject: [PATCH 04/25] add tests for chameleon processor --- .../models/chameleon/test_processor_chameleon.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 tests/models/chameleon/test_processor_chameleon.py diff --git a/tests/models/chameleon/test_processor_chameleon.py b/tests/models/chameleon/test_processor_chameleon.py new file mode 100644 index 00000000000000..74314e3d4c1e95 --- /dev/null +++ b/tests/models/chameleon/test_processor_chameleon.py @@ -0,0 +1,16 @@ +import tempfile +import unittest + +from transformers import ChameleonProcessor + +from ...test_processing_common import ProcessorTesterMixin + + +class ChameleonProcessorTest(ProcessorTesterMixin, unittest.TestCase): + from_pretrained_id = "leloy/Anole-7b-v0.1-hf" + processor_class = ChameleonProcessor + + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + processor = self.processor_class.from_pretrained(self.from_pretrained_id) + processor.save_pretrained(self.tmpdirname) From 58c6b53661a1d6111710d37f02ea7a489410e0f1 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Fri, 16 Aug 2024 18:07:33 +0800 Subject: [PATCH 05/25] fix tests --- .../chameleon/test_processor_chameleon.py | 17 +++++++++++++++++ tests/test_processing_common.py | 7 +++++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/tests/models/chameleon/test_processor_chameleon.py b/tests/models/chameleon/test_processor_chameleon.py index 74314e3d4c1e95..1efeaa5339d304 100644 --- a/tests/models/chameleon/test_processor_chameleon.py +++ b/tests/models/chameleon/test_processor_chameleon.py @@ -2,6 +2,7 @@ import unittest from transformers import ChameleonProcessor +from transformers.models.auto.processing_auto import processor_class_from_name from ...test_processing_common import ProcessorTesterMixin @@ -10,6 +11,22 @@ class ChameleonProcessorTest(ProcessorTesterMixin, unittest.TestCase): from_pretrained_id = "leloy/Anole-7b-v0.1-hf" processor_class = ChameleonProcessor + def get_component(self, attribute, **kwargs): + assert attribute in self.processor_class.attributes + component_class_name = getattr(self.processor_class, f"{attribute}_class") + if isinstance(component_class_name, tuple): + if "_fast" in component_class_name[0]: + component_class_name = component_class_name[0] + else: + component_class_name = component_class_name[1] + + component_class = processor_class_from_name(component_class_name) + component = component_class.from_pretrained(self.tmpdirname, **kwargs) # noqa + if attribute == "tokenizer" and not component.pad_token: + component.pad_token = "[TEST_PAD]" + + return component + def setUp(self): self.tmpdirname = tempfile.mkdtemp() processor = self.processor_class.from_pretrained(self.from_pretrained_id) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index a30c6363b9d7ff..31f14e7294380d 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -143,7 +143,7 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): def test_image_processor_defaults_preserved_by_image_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", 
size=(234, 234)) + image_processor = self.get_component("image_processor", size=(234, 234), crop_size=(234, 234)) tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) @@ -187,7 +187,7 @@ def test_kwargs_overrides_default_image_processor_kwargs(self): input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input, size=[224, 224]) + inputs = processor(text=input_str, images=image_input, size=[224, 224], crop_size=(224, 224)) self.assertEqual(len(inputs["pixel_values"][0][0]), 224) @require_torch @@ -208,6 +208,7 @@ def test_unstructured_kwargs(self): images=image_input, return_tensors="pt", size={"height": 214, "width": 214}, + crop_size={"height": 214, "width": 214}, padding="max_length", max_length=76, ) @@ -233,6 +234,7 @@ def test_unstructured_kwargs_batched(self): images=image_input, return_tensors="pt", size={"height": 214, "width": 214}, + crop_size={"height": 214, "width": 214}, padding="longest", max_length=76, ) @@ -260,6 +262,7 @@ def test_doubly_passed_kwargs(self): images=image_input, images_kwargs={"size": {"height": 222, "width": 222}}, size={"height": 214, "width": 214}, + crop_size={"height": 214, "width": 214}, ) @require_torch From 6592ce3d9e6de63a6ab8027f158701b437c2bda4 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Fri, 16 Aug 2024 18:34:41 +0800 Subject: [PATCH 06/25] fix chameleon tests --- tests/test_processing_common.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index 31f14e7294380d..6a0bdd5e86349a 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -282,7 +282,10 @@ def test_structured_kwargs_nested(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"size": {"height": 214, "width": 214}}, + "images_kwargs": { + "size": {"height": 214, "width": 214}, + "crop_size": {"height": 214, "width": 214}, + }, "text_kwargs": {"padding": "max_length", "max_length": 76}, } @@ -310,7 +313,10 @@ def test_structured_kwargs_nested_from_dict(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"size": {"height": 214, "width": 214}}, + "images_kwargs": { + "size": {"height": 214, "width": 214}, + "crop_size": {"height": 214, "width": 214}, + }, "text_kwargs": {"padding": "max_length", "max_length": 76}, } From c4f5474b25867f486ecd8224dd4efa41aea54bdd Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Fri, 16 Aug 2024 21:59:58 +0800 Subject: [PATCH 07/25] don't hardcode arg names --- tests/test_processing_common.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index 6a0bdd5e86349a..ec1e211872e667 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -49,6 +49,8 @@ @require_torch class ProcessorTesterMixin: processor_class = None + text_data_arg_name = "input_ids" + images_data_arg_name = "pixel_values" def prepare_processor_dict(self): return {} @@ -136,7 +138,7 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input, return_tensors="pt") - 
self.assertEqual(len(inputs["input_ids"][0]), 117) + self.assertEqual(len(inputs[self.text_data_arg_name][0]), 117) @require_torch @require_vision @@ -153,7 +155,7 @@ def test_image_processor_defaults_preserved_by_image_kwargs(self): image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input) - self.assertEqual(len(inputs["pixel_values"][0][0]), 234) + self.assertEqual(len(inputs[self.images_data_arg_name][0][0]), 234) @require_vision @require_torch @@ -171,7 +173,7 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): inputs = processor( text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length" ) - self.assertEqual(len(inputs["input_ids"][0]), 112) + self.assertEqual(len(inputs[self.text_data_arg_name][0]), 112) @require_torch @require_vision @@ -188,7 +190,7 @@ def test_kwargs_overrides_default_image_processor_kwargs(self): image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input, size=[224, 224], crop_size=(224, 224)) - self.assertEqual(len(inputs["pixel_values"][0][0]), 224) + self.assertEqual(len(inputs[self.images_data_arg_name][0][0]), 224) @require_torch @require_vision @@ -213,8 +215,8 @@ def test_unstructured_kwargs(self): max_length=76, ) - self.assertEqual(inputs["pixel_values"].shape[2], 214) - self.assertEqual(len(inputs["input_ids"][0]), 76) + self.assertEqual(inputs[self.images_data_arg_name].shape[2], 214) + self.assertEqual(len(inputs[self.text_data_arg_name][0]), 76) @require_torch @require_vision @@ -239,9 +241,9 @@ def test_unstructured_kwargs_batched(self): max_length=76, ) - self.assertEqual(inputs["pixel_values"].shape[2], 214) + self.assertEqual(inputs[self.images_data_arg_name].shape[2], 214) - self.assertEqual(len(inputs["input_ids"][0]), 6) + self.assertEqual(len(inputs[self.text_data_arg_name][0]), 6) @require_torch @require_vision @@ -292,9 +294,9 @@ def test_structured_kwargs_nested(self): inputs = processor(text=input_str, images=image_input, **all_kwargs) self.skip_processor_without_typed_kwargs(processor) - self.assertEqual(inputs["pixel_values"].shape[2], 214) + self.assertEqual(inputs[self.images_data_arg_name].shape[2], 214) - self.assertEqual(len(inputs["input_ids"][0]), 76) + self.assertEqual(len(inputs[self.text_data_arg_name][0]), 76) @require_torch @require_vision @@ -321,9 +323,9 @@ def test_structured_kwargs_nested_from_dict(self): } inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.assertEqual(inputs["pixel_values"].shape[2], 214) + self.assertEqual(inputs[self.images_data_arg_name].shape[2], 214) - self.assertEqual(len(inputs["input_ids"][0]), 76) + self.assertEqual(len(inputs[self.text_data_arg_name][0]), 76) class MyProcessor(ProcessorMixin): From ce9cc731d46294fa5c095f713ce3b6d9e8c91b4f Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Sat, 17 Aug 2024 14:16:08 +0800 Subject: [PATCH 08/25] uniformize processor kwargs of altclip, bridgetower, flava, instructblipvideo, llava_next, llava_next_video, siglip, video_llava, vilt --- .../models/altclip/processing_altclip.py | 38 +++- .../image_processing_bridgetower.py | 4 +- .../models/flava/processing_flava.py | 89 +++++---- .../processing_instructblipvideo.py | 102 +++++----- .../llava_next/image_processing_llava_next.py | 6 +- .../processing_llava_next_video.py | 73 ++++--- .../models/siglip/processing_siglip.py | 68 +++---- .../video_llava/processing_video_llava.py | 74 ++++---- .../models/vilt/image_processing_vilt.py | 8 +- 
.../models/vilt/processing_vilt.py | 71 ++++--- .../models/altclip/test_processor_altclip.py | 16 ++ tests/models/flava/test_processor_flava.py | 10 +- .../test_processor_instructblipvideo.py | 21 +++ .../test_processor_llava_next_video.py | 16 ++ tests/models/siglip/test_processor_siglip.py | 16 ++ .../video_llava/test_processor_video_llava.py | 17 ++ tests/models/vilt/test_processor_vilt.py | 178 ++++++++++++++++++ tests/test_processing_common.py | 122 +++++++----- 18 files changed, 614 insertions(+), 315 deletions(-) create mode 100644 tests/models/altclip/test_processor_altclip.py create mode 100644 tests/models/instructblipvideo/test_processor_instructblipvideo.py create mode 100644 tests/models/llava_next_video/test_processor_llava_next_video.py create mode 100644 tests/models/siglip/test_processor_siglip.py create mode 100644 tests/models/video_llava/test_processor_video_llava.py create mode 100644 tests/models/vilt/test_processor_vilt.py diff --git a/src/transformers/models/altclip/processing_altclip.py b/src/transformers/models/altclip/processing_altclip.py index 2814b2d7f26e89..af9b0aee5930a6 100644 --- a/src/transformers/models/altclip/processing_altclip.py +++ b/src/transformers/models/altclip/processing_altclip.py @@ -16,10 +16,23 @@ Image/Text processor class for AltCLIP """ +import sys import warnings +from typing import List, Optional, Union -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import BatchEncoding +from ...image_utils import ImageInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput + + +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + + +class AltCLIPProcessingKwargs(ProcessingKwargs, total=False): + _defaults = {} class AltCLIPProcessor(ProcessorMixin): @@ -59,7 +72,12 @@ def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) - def __call__(self, text=None, images=None, return_tensors=None, **kwargs): + def __call__( + self, + text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + images: Optional[ImageInput] = None, + **kwargs: Unpack[AltCLIPProcessingKwargs], + ): """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` and `kwargs` arguments to XLMRobertaTokenizerFast's [`~XLMRobertaTokenizerFast.__call__`] if `text` is not @@ -97,11 +115,17 @@ def __call__(self, text=None, images=None, return_tensors=None, **kwargs): if text is None and images is None: raise ValueError("You have to specify either text or images. 
Both cannot be none.") + output_kwargs = self._merge_kwargs( + AltCLIPProcessingKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + if text is not None: - encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs) + encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) if images is not None: - image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs) + image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) if text is not None and images is not None: encoding["pixel_values"] = image_features.pixel_values @@ -109,7 +133,9 @@ def __call__(self, text=None, images=None, return_tensors=None, **kwargs): elif text is not None: return encoding else: - return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) + return BatchEncoding( + data=dict(**image_features), tensor_type=output_kwargs["common_kwargs"]["return_tensors"] + ) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower.py b/src/transformers/models/bridgetower/image_processing_bridgetower.py index 7272093715f882..49905d36b33518 100644 --- a/src/transformers/models/bridgetower/image_processing_bridgetower.py +++ b/src/transformers/models/bridgetower/image_processing_bridgetower.py @@ -115,8 +115,8 @@ def get_resize_output_image_size( new_width = scale * new_width new_height, new_width = int(new_height + 0.5), int(new_width + 0.5) - new_height = new_height // size_divisor * size_divisor - new_width = new_width // size_divisor * size_divisor + new_height = max(1, new_height // size_divisor) * size_divisor + new_width = max(1, new_width // size_divisor) * size_divisor return new_height, new_width diff --git a/src/transformers/models/flava/processing_flava.py b/src/transformers/models/flava/processing_flava.py index 7f439b040a8fd0..e98df78f6034d5 100644 --- a/src/transformers/models/flava/processing_flava.py +++ b/src/transformers/models/flava/processing_flava.py @@ -16,13 +16,41 @@ Image/Text processor class for FLAVA """ +import sys import warnings from typing import List, Optional, Union from ...image_utils import ImageInput -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy -from ...utils import TensorType +from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput + + +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + + +class FlavaImagesKwargs(ImagesKwargs, total=False): + return_image_mask: Optional[bool] + return_codebook_pixels: Optional[bool] + + +class FlavaProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: FlavaImagesKwargs + _defaults = { + "text_kwargs": { + "add_special_tokens": True, + "padding": False, + "truncation": False, + "stride": 0, + "return_overflowing_tokens": False, + "return_special_tokens_mask": False, + "return_offsets_mapping": False, + "return_length": False, + "verbose": True, + }, + } class FlavaProcessor(ProcessorMixin): @@ -64,23 +92,7 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, - add_special_tokens: bool = True, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, 
TruncationStrategy] = False, - max_length: Optional[int] = None, - stride: int = 0, - pad_to_multiple_of: Optional[int] = None, - return_image_mask: Optional[bool] = None, - return_codebook_pixels: Optional[bool] = None, - return_token_type_ids: Optional[bool] = None, - return_attention_mask: Optional[bool] = None, - return_overflowing_tokens: bool = False, - return_special_tokens_mask: bool = False, - return_offsets_mapping: bool = False, - return_length: bool = False, - verbose: bool = True, - return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs, + **kwargs: Unpack[FlavaProcessorKwargs], ): """ This method uses [`FlavaImageProcessor.__call__`] method to prepare image(s) for the model, and @@ -92,33 +104,16 @@ def __call__( if text is None and images is None: raise ValueError("You have to specify either text or images. Both cannot be none.") + output_kwargs = self._merge_kwargs( + FlavaProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + if text is not None: - encoding = self.tokenizer( - text=text, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - stride=stride, - pad_to_multiple_of=pad_to_multiple_of, - return_token_type_ids=return_token_type_ids, - return_attention_mask=return_attention_mask, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_offsets_mapping=return_offsets_mapping, - return_length=return_length, - verbose=verbose, - return_tensors=return_tensors, - **kwargs, - ) + encoding = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) if images is not None: - image_features = self.image_processor( - images, - return_image_mask=return_image_mask, - return_codebook_pixels=return_codebook_pixels, - return_tensors=return_tensors, - **kwargs, - ) + image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) if text is not None and images is not None: encoding.update(image_features) @@ -126,7 +121,9 @@ def __call__( elif text is not None: return encoding else: - return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) + return BatchEncoding( + data=dict(**image_features), tensor_type=output_kwargs["common_kwargs"]["return_tensors"] + ) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py index f56f8186b07d73..c38edc2048549f 100644 --- a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py @@ -17,26 +17,48 @@ """ import os +import sys from typing import List, Optional, Union from ...image_processing_utils import BatchFeature from ...image_utils import VideoInput -from ...processing_utils import ProcessorMixin +from ...processing_utils import ProcessingKwargs, ProcessorMixin from ...tokenization_utils_base import ( AddedToken, BatchEncoding, - PaddingStrategy, PreTokenizedInput, TextInput, - TruncationStrategy, ) -from ...utils import TensorType, logging +from ...utils import logging from ..auto import AutoTokenizer +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + + logger = logging.get_logger(__name__) +class InstructBlipVideoProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "add_special_tokens": True, + "padding": 
False, + "truncation": None, + "stride": 0, + "return_overflowing_tokens": False, + "return_special_tokens_mask": False, + "return_offsets_mapping": False, + "return_token_type_ids": False, + "return_length": False, + "verbose": True, + }, + } + + class InstructBlipVideoProcessor(ProcessorMixin): r""" Constructs an InstructBLIPVideo processor which wraps a InstructBLIP image processor and a LLaMa/T5 tokenizer into a single @@ -71,23 +93,11 @@ def __init__(self, image_processor, tokenizer, qformer_tokenizer=None, num_query def __call__( self, - images: VideoInput = None, - text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, - add_special_tokens: bool = True, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = None, - max_length: Optional[int] = None, - stride: int = 0, - pad_to_multiple_of: Optional[int] = None, - return_attention_mask: Optional[bool] = None, - return_overflowing_tokens: bool = False, - return_special_tokens_mask: bool = False, - return_offsets_mapping: bool = False, - return_token_type_ids: bool = False, - return_length: bool = False, - verbose: bool = True, - return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs, + images: Optional[VideoInput] = None, + text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + audio=None, + videos=None, + **kwargs: Unpack[InstructBlipVideoProcessorKwargs], ) -> BatchFeature: """ This method uses [`InstructBlipVideoImageProcessor.__call__`] method to prepare image(s) or video(s) for the model, and @@ -95,6 +105,12 @@ def __call__( Please refer to the docstring of the above two methods for more information. """ + output_kwargs = self._merge_kwargs( + InstructBlipVideoProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + encoding = BatchFeature() if text is not None: @@ -105,21 +121,10 @@ def __call__( _text_encoding = self.tokenizer( text=text, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - stride=stride, - pad_to_multiple_of=pad_to_multiple_of, - return_attention_mask=return_attention_mask, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_offsets_mapping=return_offsets_mapping, - return_token_type_ids=return_token_type_ids, - return_length=return_length, - verbose=verbose, - return_tensors=None, # required to concatenate below - **kwargs, + **{ + **output_kwargs["text_kwargs"], + "return_tensors": None, # required to concatenate below + }, ) # if we know how many query tokens, expand text inside processor. 
We need this hacky manipulation @@ -145,31 +150,16 @@ def __call__( ) # cast to desired return tensors type after concatenating - text_encoding = BatchEncoding(text_encoding, tensor_type=return_tensors) - encoding.update(text_encoding) - qformer_text_encoding = self.qformer_tokenizer( - text=text, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - stride=stride, - pad_to_multiple_of=pad_to_multiple_of, - return_attention_mask=return_attention_mask, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_offsets_mapping=return_offsets_mapping, - return_token_type_ids=return_token_type_ids, - return_length=return_length, - verbose=verbose, - return_tensors=return_tensors, - **kwargs, + text_encoding = BatchEncoding( + text_encoding, tensor_type=output_kwargs["common_kwargs"].get("return_tensors") ) + encoding.update(text_encoding) + qformer_text_encoding = self.qformer_tokenizer(text=text, **output_kwargs["text_kwargs"]) encoding["qformer_input_ids"] = qformer_text_encoding.pop("input_ids") encoding["qformer_attention_mask"] = qformer_text_encoding.pop("attention_mask") if images is not None: - image_encoding = self.image_processor(images, return_tensors=return_tensors) + image_encoding = self.image_processor(images, **output_kwargs["images_kwargs"]) encoding.update(image_encoding) return encoding diff --git a/src/transformers/models/llava_next/image_processing_llava_next.py b/src/transformers/models/llava_next/image_processing_llava_next.py index f744b9fcf9c1cd..f8237d0078bf0a 100644 --- a/src/transformers/models/llava_next/image_processing_llava_next.py +++ b/src/transformers/models/llava_next/image_processing_llava_next.py @@ -720,7 +720,11 @@ def preprocess( image_patches = self.get_image_patches( image, image_grid_pinpoints, - size=(size["shortest_edge"], size["shortest_edge"]), + size=( + (size["shortest_edge"], size["shortest_edge"]) + if "shortest_edge" in size + else (size["height"], size["width"]) + ), patch_size=crop_size["height"], resample=resample, data_format=input_data_format, diff --git a/src/transformers/models/llava_next_video/processing_llava_next_video.py b/src/transformers/models/llava_next_video/processing_llava_next_video.py index efbb193ba62a9f..e693ce265ef1e6 100644 --- a/src/transformers/models/llava_next_video/processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/processing_llava_next_video.py @@ -16,13 +16,20 @@ Processor class for LLaVa-NeXT-Video. 
""" +import sys from typing import TYPE_CHECKING, List, Optional, Union from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, VideoInput, get_image_size, to_numpy_array -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy -from ...utils import TensorType, logging +from ...processing_utils import ProcessingKwargs, ProcessorMixin +from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils import logging + + +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack if TYPE_CHECKING: @@ -31,6 +38,17 @@ logger = logging.get_logger(__name__) +class LlavaNextVideoProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "padding": False, + }, + "common_kwargs": { + "return_tensors": "pt", + }, + } + + class LlavaNextVideoProcessor(ProcessorMixin): r""" Constructs a LLaVa-NeXT-Video processor which wraps a LLaVa-NeXT image processor, LLaVa-NeXT-Video video processor and @@ -88,12 +106,10 @@ def __init__( def __call__( self, text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]], - images: ImageInput = None, - videos: VideoInput = None, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = None, - max_length: int = None, - return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH, + images: Optional[ImageInput] = None, + videos: Optional[VideoInput] = None, + audio=None, + **kwargs: Unpack[LlavaNextVideoProcessorKwargs], ) -> BatchFeature: """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` @@ -115,26 +131,6 @@ def __call__( videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`): The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported. - padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): - Select a strategy to pad the returned sequences (according to the model's padding side and padding - index) among: - - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single - sequence if provided). - - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum - acceptable input length for the model if that argument is not provided. - - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different - lengths). - max_length (`int`, *optional*): - Maximum length of the returned list and optionally padding length (see above). - truncation (`bool`, *optional*): - Activates truncation to cut input sequences longer than `max_length` to `max_length`. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - - `'tf'`: Return TensorFlow `tf.constant` objects. - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - - `'jax'`: Return JAX `jnp.ndarray` objects. Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: @@ -145,13 +141,19 @@ def __call__( `None`). - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. 
""" + output_kwargs = self._merge_kwargs( + LlavaNextVideoProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + if images is not None: - image_inputs = self.image_processor(images, return_tensors=return_tensors) + image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) else: image_inputs = {} if videos is not None: - videos_inputs = self.video_processor(videos, return_tensors=return_tensors) + videos_inputs = self.video_processor(videos, **output_kwargs["videos_kwargs"]) else: videos_inputs = {} @@ -203,14 +205,7 @@ def __call__( sample = sample.replace(self.video_token, self.video_token * num_video_tokens) prompt_strings.append(sample) - text_inputs = self.tokenizer( - prompt_strings, - return_tensors=return_tensors, - padding=padding, - truncation=truncation, - max_length=max_length, - ) - print(text_inputs.keys()) + text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}) diff --git a/src/transformers/models/siglip/processing_siglip.py b/src/transformers/models/siglip/processing_siglip.py index 655fb4d4f78ab0..eef71e33424d8e 100644 --- a/src/transformers/models/siglip/processing_siglip.py +++ b/src/transformers/models/siglip/processing_siglip.py @@ -16,13 +16,30 @@ Image/Text processor class for SigLIP. """ +import sys from typing import List, Optional, Union from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy -from ...utils import TensorType +from ...processing_utils import ProcessingKwargs, ProcessorMixin +from ...tokenization_utils_base import PreTokenizedInput, TextInput + + +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + + +class SiglipProcessingKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "padding": False, + }, + "common_kwargs": { + "return_tensors": "pt", + }, + } class SiglipProcessor(ProcessorMixin): @@ -48,12 +65,9 @@ def __init__(self, image_processor, tokenizer): def __call__( self, - text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, - images: ImageInput = None, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = None, - max_length: int = None, - return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH, + text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + images: Optional[ImageInput] = None, + **kwargs: Unpack[SiglipProcessingKwargs], ) -> BatchFeature: """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` @@ -70,26 +84,6 @@ def __call__( images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. Both channels-first and channels-last formats are supported. 
- padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): - Select a strategy to pad the returned sequences (according to the model's padding side and padding - index) among: - - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single - sequence if provided). - - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum - acceptable input length for the model if that argument is not provided. - - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different - lengths). - max_length (`int`, *optional*): - Maximum length of the returned list and optionally padding length (see above). - truncation (`bool`, *optional*): - Activates truncation to cut input sequences longer than `max_length` to `max_length`. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - - `'tf'`: Return TensorFlow `tf.constant` objects. - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - - `'jax'`: Return JAX `jnp.ndarray` objects. Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: @@ -104,13 +98,17 @@ def __call__( if text is None and images is None: raise ValueError("You have to specify either text or images. Both cannot be none.") + output_kwargs = self._merge_kwargs( + SiglipProcessingKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + if text is not None: - encoding = self.tokenizer( - text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length - ) + encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) if images is not None: - image_features = self.image_processor(images, return_tensors=return_tensors) + image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) if text is not None and images is not None: encoding["pixel_values"] = image_features.pixel_values @@ -118,7 +116,9 @@ def __call__( elif text is not None: return encoding else: - return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) + return BatchFeature( + data=dict(**image_features), tensor_type=output_kwargs["common_kwargs"]["return_tensors"] + ) def decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/video_llava/processing_video_llava.py b/src/transformers/models/video_llava/processing_video_llava.py index a06913d7acf760..774c4003f3cb0f 100644 --- a/src/transformers/models/video_llava/processing_video_llava.py +++ b/src/transformers/models/video_llava/processing_video_llava.py @@ -16,18 +16,36 @@ Processor class for VideoLlava. 
""" +import sys from typing import List, Optional, Union from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, get_image_size, to_numpy_array -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy -from ...utils import TensorType, logging +from ...processing_utils import ProcessingKwargs, ProcessorMixin +from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils import logging + + +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack logger = logging.get_logger(__name__) +class VideoLlavaProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "padding": False, + }, + "common_kwargs": { + "return_tensors": "pt", + }, + } + + class VideoLlavaProcessor(ProcessorMixin): r""" Constructs a VideoLlava processor which wraps a VideoLlava image processor and a Llava tokenizer into a single processor. @@ -77,13 +95,11 @@ def __init__( def __call__( self, - text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, - images: ImageInput = None, - videos: ImageInput = None, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = None, - max_length=None, - return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH, + text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + images: Optional[ImageInput] = None, + videos: Optional[ImageInput] = None, + audio=None, + **kwargs: Unpack[VideoLlavaProcessorKwargs], ) -> BatchFeature: """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` @@ -105,26 +121,6 @@ def __call__( Video frames to preprocess. Expects a single or batch of video frames in NumPy array or PyTorch tensor. Each video should be of shape (T, C, H, W), where T is number of frames, C is number of channels, H and W are image height and width. - padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): - Select a strategy to pad the returned sequences (according to the model's padding side and padding - index) among: - - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single - sequence if provided). - - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum - acceptable input length for the model if that argument is not provided. - - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different - lengths). - max_length (`int`, *optional*): - Maximum length of the returned list and optionally padding length (see above). - truncation (`bool`, *optional*): - Activates truncation to cut input sequences longer than `max_length` to `max_length`. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - - `'tf'`: Return TensorFlow `tf.constant` objects. - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - - `'jax'`: Return JAX `jnp.ndarray` objects. Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: @@ -135,9 +131,17 @@ def __call__( `None`). - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. 
""" + output_kwargs = self._merge_kwargs( + VideoLlavaProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + # Temporary fix for "paddding_side" in init_kwargs + _ = output_kwargs["text_kwargs"].pop("padding_side", None) + data = {} if images is not None or videos is not None: - encoded_images = self.image_processor(images=images, videos=videos, return_tensors=return_tensors) + encoded_images = self.image_processor(images=images, videos=videos, **output_kwargs["images_kwargs"]) data.update(encoded_images) if isinstance(text, str): @@ -174,13 +178,7 @@ def __call__( sample = sample.replace(self.video_token, self.video_token * num_video_tokens) prompt_strings.append(sample) - text_inputs = self.tokenizer( - prompt_strings, - return_tensors=return_tensors, - padding=padding, - truncation=truncation, - max_length=max_length, - ) + text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) data.update(text_inputs) return BatchFeature(data=data) diff --git a/src/transformers/models/vilt/image_processing_vilt.py b/src/transformers/models/vilt/image_processing_vilt.py index 66ffeb816fec5e..f2c3529218e257 100644 --- a/src/transformers/models/vilt/image_processing_vilt.py +++ b/src/transformers/models/vilt/image_processing_vilt.py @@ -112,8 +112,8 @@ def get_resize_output_image_size( new_width = scale * new_width new_height, new_width = int(new_height + 0.5), int(new_width + 0.5) - new_height = new_height // size_divisor * size_divisor - new_width = new_width // size_divisor * size_divisor + new_height = max(1, new_height // size_divisor) * size_divisor + new_width = max(1, new_width // size_divisor) * size_divisor return new_height, new_width @@ -236,9 +236,7 @@ def resize( The channel dimension format of the input image. If not provided, it will be inferred. """ size = get_size_dict(size, default_to_square=False) - if "shortest_edge" not in size: - raise ValueError(f"The `size` dictionary must contain the key `shortest_edge`. Got {size.keys()}") - shorter = size["shortest_edge"] + shorter = size["shortest_edge"] if "shortest_edge" in size else min(size["height"], size["width"]) longer = int(1333 / 800 * shorter) output_size = get_resize_output_image_size( image, shorter=shorter, longer=longer, size_divisor=size_divisor, input_data_format=input_data_format diff --git a/src/transformers/models/vilt/processing_vilt.py b/src/transformers/models/vilt/processing_vilt.py index 0ccb884ea00c9d..46e18b3ff6bd88 100644 --- a/src/transformers/models/vilt/processing_vilt.py +++ b/src/transformers/models/vilt/processing_vilt.py @@ -16,12 +16,34 @@ Processor class for ViLT. 
""" +import sys import warnings from typing import List, Optional, Union -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy -from ...utils import TensorType +from ...image_utils import ImageInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput + + +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + + +class ViltProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "add_special_tokens": True, + "padding": False, + "stride": 0, + "return_overflowing_tokens": False, + "return_special_tokens_mask": False, + "return_offsets_mapping": False, + "return_length": False, + "verbose": True, + }, + } class ViltProcessor(ProcessorMixin): @@ -63,23 +85,9 @@ def __init__(self, image_processor=None, tokenizer=None, **kwargs): def __call__( self, - images, - text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, - add_special_tokens: bool = True, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = None, - max_length: Optional[int] = None, - stride: int = 0, - pad_to_multiple_of: Optional[int] = None, - return_token_type_ids: Optional[bool] = None, - return_attention_mask: Optional[bool] = None, - return_overflowing_tokens: bool = False, - return_special_tokens_mask: bool = False, - return_offsets_mapping: bool = False, - return_length: bool = False, - verbose: bool = True, - return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs, + images: ImageInput, + text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + **kwargs: Unpack[ViltProcessorKwargs], ) -> BatchEncoding: """ This method uses [`ViltImageProcessor.__call__`] method to prepare image(s) for the model, and @@ -87,26 +95,15 @@ def __call__( Please refer to the docstring of the above two methods for more information. 
""" - encoding = self.tokenizer( - text=text, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - stride=stride, - pad_to_multiple_of=pad_to_multiple_of, - return_token_type_ids=return_token_type_ids, - return_attention_mask=return_attention_mask, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_offsets_mapping=return_offsets_mapping, - return_length=return_length, - verbose=verbose, - return_tensors=return_tensors, + output_kwargs = self._merge_kwargs( + ViltProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, ) + + encoding = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) # add pixel_values + pixel_mask - encoding_image_processor = self.image_processor(images, return_tensors=return_tensors) + encoding_image_processor = self.image_processor(images, **output_kwargs["images_kwargs"]) encoding.update(encoding_image_processor) return encoding diff --git a/tests/models/altclip/test_processor_altclip.py b/tests/models/altclip/test_processor_altclip.py new file mode 100644 index 00000000000000..86a84ae9ab8bc6 --- /dev/null +++ b/tests/models/altclip/test_processor_altclip.py @@ -0,0 +1,16 @@ +import tempfile +import unittest + +from transformers import AltCLIPProcessor + +from ...test_processing_common import ProcessorTesterMixin + + +class AltCLIPProcessorTest(ProcessorTesterMixin, unittest.TestCase): + from_pretrained_id = "BAAI/AltCLIP" + processor_class = AltCLIPProcessor + + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + processor = self.processor_class.from_pretrained(self.from_pretrained_id) + processor.save_pretrained(self.tmpdirname) diff --git a/tests/models/flava/test_processor_flava.py b/tests/models/flava/test_processor_flava.py index a83e459153d532..56a52ee21c7b07 100644 --- a/tests/models/flava/test_processor_flava.py +++ b/tests/models/flava/test_processor_flava.py @@ -22,16 +22,18 @@ import numpy as np import pytest -from transformers import BertTokenizer, BertTokenizerFast +from transformers import BertTokenizer, BertTokenizerFast, FlavaProcessor from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES from transformers.testing_utils import require_vision from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available +from ...test_processing_common import ProcessorTesterMixin + if is_vision_available(): from PIL import Image - from transformers import FlavaImageProcessor, FlavaProcessor + from transformers import FlavaImageProcessor from transformers.models.flava.image_processing_flava import ( FLAVA_CODEBOOK_MEAN, FLAVA_CODEBOOK_STD, @@ -41,7 +43,9 @@ @require_vision -class FlavaProcessorTest(unittest.TestCase): +class FlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = FlavaProcessor + def setUp(self): self.tmpdirname = tempfile.mkdtemp() diff --git a/tests/models/instructblipvideo/test_processor_instructblipvideo.py b/tests/models/instructblipvideo/test_processor_instructblipvideo.py new file mode 100644 index 00000000000000..07c81f1f649651 --- /dev/null +++ b/tests/models/instructblipvideo/test_processor_instructblipvideo.py @@ -0,0 +1,21 @@ +import tempfile +import unittest + +from transformers import InstructBlipVideoProcessor + +from ...test_processing_common import ProcessorTesterMixin + + +class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase): + from_pretrained_id = "Salesforce/instructblip-vicuna-7b" + 
processor_class = InstructBlipVideoProcessor + + def prepare_components(self): + components = super().prepare_components() + components["qformer_tokenizer"] = components["tokenizer"] + return components + + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + processor = self.processor_class.from_pretrained(self.from_pretrained_id) + processor.save_pretrained(self.tmpdirname) diff --git a/tests/models/llava_next_video/test_processor_llava_next_video.py b/tests/models/llava_next_video/test_processor_llava_next_video.py new file mode 100644 index 00000000000000..9cd4615d572547 --- /dev/null +++ b/tests/models/llava_next_video/test_processor_llava_next_video.py @@ -0,0 +1,16 @@ +import tempfile +import unittest + +from transformers import LlavaNextVideoProcessor + +from ...test_processing_common import ProcessorTesterMixin + + +class LlavaNextVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase): + from_pretrained_id = "llava-hf/LLaVA-NeXT-Video-7B-hf" + processor_class = LlavaNextVideoProcessor + + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + processor = self.processor_class.from_pretrained(self.from_pretrained_id) + processor.save_pretrained(self.tmpdirname) diff --git a/tests/models/siglip/test_processor_siglip.py b/tests/models/siglip/test_processor_siglip.py new file mode 100644 index 00000000000000..608ff70539a218 --- /dev/null +++ b/tests/models/siglip/test_processor_siglip.py @@ -0,0 +1,16 @@ +import tempfile +import unittest + +from transformers import SiglipProcessor + +from ...test_processing_common import ProcessorTesterMixin + + +class SiglipProcessorTest(ProcessorTesterMixin, unittest.TestCase): + from_pretrained_id = "google/siglip-base-patch16-224" + processor_class = SiglipProcessor + + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + processor = self.processor_class.from_pretrained(self.from_pretrained_id) + processor.save_pretrained(self.tmpdirname) diff --git a/tests/models/video_llava/test_processor_video_llava.py b/tests/models/video_llava/test_processor_video_llava.py new file mode 100644 index 00000000000000..9ddc84a6bcb944 --- /dev/null +++ b/tests/models/video_llava/test_processor_video_llava.py @@ -0,0 +1,17 @@ +import tempfile +import unittest + +from transformers.models.video_llava.processing_video_llava import VideoLlavaProcessor + +from ...test_processing_common import ProcessorTesterMixin + + +class VideoLlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase): + from_pretrained_id = "LanguageBind/Video-LLaVA-7B-hf" + processor_class = VideoLlavaProcessor + images_data_arg_name = "pixel_values_images" + + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + processor = self.processor_class.from_pretrained(self.from_pretrained_id) + processor.save_pretrained(self.tmpdirname) diff --git a/tests/models/vilt/test_processor_vilt.py b/tests/models/vilt/test_processor_vilt.py new file mode 100644 index 00000000000000..0ae6a5256d1b32 --- /dev/null +++ b/tests/models/vilt/test_processor_vilt.py @@ -0,0 +1,178 @@ +import tempfile +import unittest + +from transformers import ViltProcessor +from transformers.testing_utils import require_torch, require_vision + +from ...test_processing_common import ProcessorTesterMixin + + +class ViltProcessorTest(ProcessorTesterMixin, unittest.TestCase): + from_pretrained_id = "dandelin/vilt-b32-mlm" + processor_class = ViltProcessor + + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + processor = self.processor_class.from_pretrained(self.from_pretrained_id) + 
processor.save_pretrained(self.tmpdirname) + + @require_torch + @require_vision + def test_image_processor_defaults_preserved_by_image_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + processor_components = self.prepare_components() + processor_components["image_processor"] = self.get_component( + "image_processor", size=(234, 234), crop_size=(234, 234), size_divisor=32 + ) + processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") + + processor = self.processor_class(**processor_components) + self.skip_processor_without_typed_kwargs(processor) + + # VILT resizes images to dims divisible by size_divisor + vilt_compatible_image_size = (32, 384) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input, return_tensors="pt") + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], vilt_compatible_image_size[-1]) + + @require_torch + @require_vision + def test_kwargs_overrides_default_image_processor_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + processor_components = self.prepare_components() + processor_components["image_processor"] = self.get_component("image_processor", size=(234, 234)) + processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") + + processor = self.processor_class(**processor_components) + self.skip_processor_without_typed_kwargs(processor) + + # VILT resizes images to dims divisible by size_divisor + vilt_compatible_image_size = (32, 352) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor( + text=input_str, images=image_input, size=[224, 224], crop_size=(224, 224), return_tensors="pt" + ) + + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], vilt_compatible_image_size[-1]) + + @require_torch + @require_vision + def test_structured_kwargs_nested(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) + self.skip_processor_without_typed_kwargs(processor) + + # VILT resizes images to dims divisible by size_divisor + vilt_compatible_image_size = (32, 352) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": { + "size": {"height": 214, "width": 214}, + "crop_size": {"height": 214, "width": 214}, + }, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.skip_processor_without_typed_kwargs(processor) + + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], vilt_compatible_image_size[-1]) + self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 76) + + @require_torch + @require_vision + def test_structured_kwargs_nested_from_dict(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + processor_components = self.prepare_components() + processor = 
self.processor_class(**processor_components) + self.skip_processor_without_typed_kwargs(processor) + + # VILT resizes images to dims divisible by size_divisor + vilt_compatible_image_size = (32, 352) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": { + "size": {"height": 214, "width": 214}, + "crop_size": {"height": 214, "width": 214}, + }, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], vilt_compatible_image_size[-1]) + self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 76) + + @require_torch + @require_vision + def test_unstructured_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) + self.skip_processor_without_typed_kwargs(processor) + + # VILT resizes images to dims divisible by size_divisor + vilt_compatible_image_size = (32, 352) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + size={"height": 214, "width": 214}, + crop_size={"height": 214, "width": 214}, + padding="max_length", + max_length=76, + ) + + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], vilt_compatible_image_size[-1]) + self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 76) + + @require_torch + @require_vision + def test_unstructured_kwargs_batched(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) + self.skip_processor_without_typed_kwargs(processor) + + # VILT resizes images to dims divisible by size_divisor + vilt_compatible_image_size = (32, 352) + + input_str = ["lower newer", "upper older longer string"] + image_input = self.prepare_image_inputs() * 2 + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + size={"height": 214, "width": 214}, + crop_size={"height": 214, "width": 214}, + padding="longest", + max_length=76, + ) + + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], vilt_compatible_image_size[-1]) + self.assertEqual(len(inputs[self.text_data_arg_name][0]), len(inputs[self.text_data_arg_name][1])) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index ec1e211872e667..bad0eb8cd6b72b 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -26,6 +26,7 @@ import unittest import numpy as np +from huggingface_hub import hf_hub_download from transformers import CLIPTokenizerFast, ProcessorMixin from transformers.models.auto.processing_auto import processor_class_from_name @@ -51,6 +52,7 @@ class ProcessorTesterMixin: processor_class = None text_data_arg_name = "input_ids" images_data_arg_name = "pixel_values" + videos_data_arg_name = "pixel_values_videos" def prepare_processor_dict(self): return {} @@ -90,6 +92,13 @@ def prepare_image_inputs(self): image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] return image_inputs 
+ @require_vision + def prepare_video_inputs(self): + video_file = hf_hub_download( + repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset" + ) + return [np.load(video_file)] + def test_processor_to_json_string(self): processor = self.get_processor() obj = json.loads(processor.to_json_string()) @@ -129,43 +138,69 @@ def skip_processor_without_typed_kwargs(self, processor): def test_tokenizer_defaults_preserved_by_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") + processor_components = self.prepare_components() + processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input, return_tensors="pt") - self.assertEqual(len(inputs[self.text_data_arg_name][0]), 117) + self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 117) @require_torch @require_vision def test_image_processor_defaults_preserved_by_image_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") + processor_components = self.prepare_components() + processor_components["image_processor"] = self.get_component( + "image_processor", size=(234, 234), crop_size=(234, 234) + ) + processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") + + processor = self.processor_class(**processor_components) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input, return_tensors="pt") + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 234) + + @require_torch + @require_vision + def test_video_processor_defaults_preserved_by_kwargs(self): + if "video_processor" not in self.processor_class.attributes: + self.skipTest(f"video_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor", size=(234, 234), crop_size=(234, 234)) + video_processor = self.get_component("video_processor", size=(234, 234), crop_size=(234, 234)) tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor = self.processor_class( + tokenizer=tokenizer, + image_processor=image_processor, + video_processor=video_processor, + ) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() + video_input = self.prepare_video_inputs() - inputs = processor(text=input_str, images=image_input) - self.assertEqual(len(inputs[self.images_data_arg_name][0][0]), 234) + inputs = processor(text=input_str, images=image_input, videos=video_input, return_tensors="pt") + self.assertEqual(inputs[self.videos_data_arg_name].shape[-1], 234) @require_vision @require_torch def test_kwargs_overrides_default_tokenizer_kwargs(self): if 
"image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer", padding="longest") + processor_components = self.prepare_components() + processor_components["tokenizer"] = self.get_component("tokenizer", padding="longest") - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() @@ -173,34 +208,35 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): inputs = processor( text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length" ) - self.assertEqual(len(inputs[self.text_data_arg_name][0]), 112) + self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 112) @require_torch @require_vision def test_kwargs_overrides_default_image_processor_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", size=(234, 234)) - tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") + processor_components = self.prepare_components() + processor_components["image_processor"] = self.get_component("image_processor", size=(234, 234)) + processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input, size=[224, 224], crop_size=(224, 224)) - self.assertEqual(len(inputs[self.images_data_arg_name][0][0]), 224) + inputs = processor( + text=input_str, images=image_input, size=[224, 224], crop_size=(224, 224), return_tensors="pt" + ) + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 224) @require_torch @require_vision def test_unstructured_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" @@ -215,18 +251,16 @@ def test_unstructured_kwargs(self): max_length=76, ) - self.assertEqual(inputs[self.images_data_arg_name].shape[2], 214) - self.assertEqual(len(inputs[self.text_data_arg_name][0]), 76) + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 214) + self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 76) @require_torch @require_vision def test_unstructured_kwargs_batched(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = 
self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = ["lower newer", "upper older longer string"] @@ -241,19 +275,16 @@ def test_unstructured_kwargs_batched(self): max_length=76, ) - self.assertEqual(inputs[self.images_data_arg_name].shape[2], 214) - - self.assertEqual(len(inputs[self.text_data_arg_name][0]), 6) + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 214) + self.assertEqual(len(inputs[self.text_data_arg_name][0]), len(inputs[self.text_data_arg_name][1])) @require_torch @require_vision def test_doubly_passed_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = ["lower newer"] @@ -265,6 +296,7 @@ def test_doubly_passed_kwargs(self): images_kwargs={"size": {"height": 222, "width": 222}}, size={"height": 214, "width": 214}, crop_size={"height": 214, "width": 214}, + return_tensors="pt", ) @require_torch @@ -272,10 +304,8 @@ def test_doubly_passed_kwargs(self): def test_structured_kwargs_nested(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" @@ -294,21 +324,18 @@ def test_structured_kwargs_nested(self): inputs = processor(text=input_str, images=image_input, **all_kwargs) self.skip_processor_without_typed_kwargs(processor) - self.assertEqual(inputs[self.images_data_arg_name].shape[2], 214) - - self.assertEqual(len(inputs[self.text_data_arg_name][0]), 76) + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 214) + self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 76) @require_torch @require_vision def test_structured_kwargs_nested_from_dict(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" image_input = self.prepare_image_inputs() @@ -323,9 +350,8 @@ def test_structured_kwargs_nested_from_dict(self): } inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.assertEqual(inputs[self.images_data_arg_name].shape[2], 214) - - self.assertEqual(len(inputs[self.text_data_arg_name][0]), 76) + 
self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 214) + self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 76) class MyProcessor(ProcessorMixin): From d3259142deb471abab6e394283d75f9cadf53ccf Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Sat, 17 Aug 2024 14:26:05 +0800 Subject: [PATCH 09/25] fix linter issue --- .../models/bridgetower/image_processing_bridgetower.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower.py b/src/transformers/models/bridgetower/image_processing_bridgetower.py index 49905d36b33518..b9d0d41377bfde 100644 --- a/src/transformers/models/bridgetower/image_processing_bridgetower.py +++ b/src/transformers/models/bridgetower/image_processing_bridgetower.py @@ -238,9 +238,7 @@ def resize( The channel dimension format of the input image. If not provided, it will be inferred. """ size = get_size_dict(size, default_to_square=False) - if "shortest_edge" not in size: - raise ValueError(f"The `size` dictionary must contain the key `shortest_edge`. Got {size.keys()}") - shorter = size["shortest_edge"] + shorter = size["shortest_edge"] if "shortest_edge" in size else min(size["height"], size["width"]) longer = int(1333 / 800 * shorter) output_size = get_resize_output_image_size( image, shorter=shorter, longer=longer, size_divisor=size_divisor, input_data_format=input_data_format From 935d6e51d470861484f0fd2ec1ea5e9a6982a6d2 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Mon, 19 Aug 2024 16:43:00 +0800 Subject: [PATCH 10/25] address @zucchini-nlp's comments --- .../models/altclip/processing_altclip.py | 16 ++-------- .../models/chameleon/processing_chameleon.py | 10 +++--- .../models/flava/processing_flava.py | 8 ++--- .../processing_instructblipvideo.py | 32 +++++++++++++------ .../processing_llava_next_video.py | 10 +++--- .../models/siglip/processing_siglip.py | 6 ++-- .../video_llava/processing_video_llava.py | 6 ++-- .../models/vilt/processing_vilt.py | 26 +++++++++++++-- 8 files changed, 69 insertions(+), 45 deletions(-) diff --git a/src/transformers/models/altclip/processing_altclip.py b/src/transformers/models/altclip/processing_altclip.py index af9b0aee5930a6..63168c7dd205bc 100644 --- a/src/transformers/models/altclip/processing_altclip.py +++ b/src/transformers/models/altclip/processing_altclip.py @@ -22,7 +22,7 @@ from ...image_utils import ImageInput from ...processing_utils import ProcessingKwargs, ProcessorMixin -from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput +from ...tokenization_utils_base import PreTokenizedInput, TextInput if sys.version_info >= (3, 11): @@ -94,16 +94,8 @@ def __call__( The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. Both channels-first and channels-last formats are supported. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - - `'tf'`: Return TensorFlow `tf.constant` objects. - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - - `'jax'`: Return JAX `jnp.ndarray` objects. - Returns: - [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. 
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when @@ -133,9 +125,7 @@ def __call__( elif text is not None: return encoding else: - return BatchEncoding( - data=dict(**image_features), tensor_type=output_kwargs["common_kwargs"]["return_tensors"] - ) + return image_features def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/chameleon/processing_chameleon.py b/src/transformers/models/chameleon/processing_chameleon.py index d999267ef1fa9c..d9fbe965034e59 100644 --- a/src/transformers/models/chameleon/processing_chameleon.py +++ b/src/transformers/models/chameleon/processing_chameleon.py @@ -114,7 +114,7 @@ def __call__( elif not isinstance(text, list) and not isinstance(text[0], str): raise TypeError("Invalid input text. Please provide a string, or a list of strings") if text is None and images is None: - raise ValueError("You must provide either text or images") + raise ValueError("You must provide either text or images as prompt") output_kwargs = self._merge_kwargs( ChameleonProcessorKwargs, @@ -132,12 +132,10 @@ def __call__( sample += self.tokenizer.sep_token # special Chameleon treatment to add sep for chat mode prompt_strings.append(sample) - data = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) - + features = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) if images is not None: - data["pixel_values"] = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"] - - return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"]["return_tensors"]) + features["pixel_values"] = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"] + return features # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama def batch_decode(self, *args, **kwargs): diff --git a/src/transformers/models/flava/processing_flava.py b/src/transformers/models/flava/processing_flava.py index e98df78f6034d5..b62fc77fc38746 100644 --- a/src/transformers/models/flava/processing_flava.py +++ b/src/transformers/models/flava/processing_flava.py @@ -22,7 +22,7 @@ from ...image_utils import ImageInput from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin -from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput +from ...tokenization_utils_base import PreTokenizedInput, TextInput if sys.version_info >= (3, 11): @@ -92,6 +92,8 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + audio=None, + videos=None, **kwargs: Unpack[FlavaProcessorKwargs], ): """ @@ -121,9 +123,7 @@ def __call__( elif text is not None: return encoding else: - return BatchEncoding( - data=dict(**image_features), tensor_type=output_kwargs["common_kwargs"]["return_tensors"] - ) + return image_features def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py index c38edc2048549f..58f2fc565a7fd6 100644 --- a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py @@ -20,15 +20,10 @@ import sys from typing import List, Optional, Union -from ...image_processing_utils import BatchFeature +from ...feature_extraction_utils import BatchFeature from 
...image_utils import VideoInput from ...processing_utils import ProcessingKwargs, ProcessorMixin -from ...tokenization_utils_base import ( - AddedToken, - BatchEncoding, - PreTokenizedInput, - TextInput, -) +from ...tokenization_utils_base import AddedToken, PreTokenizedInput, TextInput from ...utils import logging from ..auto import AutoTokenizer @@ -103,7 +98,26 @@ def __call__( This method uses [`InstructBlipVideoImageProcessor.__call__`] method to prepare image(s) or video(s) for the model, and [`BertTokenizerFast.__call__`] to prepare text for the model. - Please refer to the docstring of the above two methods for more information. + Args: + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + -- **qformer_input_ids** - List of token ids from the Q-Former tokenizer to be fed to a model. Returned when `text` is not `None`. + -- **qformer_attention_mask** - List of indices specifying which tokens from the Q-Former tokenizer should be attended to by the model. Returned when `text` is not `None`. + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ output_kwargs = self._merge_kwargs( InstructBlipVideoProcessorKwargs, @@ -150,7 +164,7 @@ def __call__( ) # cast to desired return tensors type after concatenating - text_encoding = BatchEncoding( + text_encoding = BatchFeature( text_encoding, tensor_type=output_kwargs["common_kwargs"].get("return_tensors") ) encoding.update(text_encoding) diff --git a/src/transformers/models/llava_next_video/processing_llava_next_video.py b/src/transformers/models/llava_next_video/processing_llava_next_video.py index e693ce265ef1e6..d11512ae7ff8f8 100644 --- a/src/transformers/models/llava_next_video/processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/processing_llava_next_video.py @@ -139,7 +139,8 @@ def __call__( - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not `None`). - - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + - **pixel_values_images** -- Pixel values of images to be fed to a model. Returned when `images` is not `None`. + - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`. 
""" output_kwargs = self._merge_kwargs( LlavaNextVideoProcessorKwargs, @@ -162,8 +163,6 @@ def __call__( elif not isinstance(text, list) and not isinstance(text[0], str): raise ValueError("Invalid input text. Please provide a string, or a list of strings") - print(self.patch_size, self.vision_feature_select_strategy, image_inputs, videos_inputs.keys()) - if self.patch_size is None or self.vision_feature_select_strategy is None: prompt_strings = text logger.warning_once( @@ -207,7 +206,10 @@ def __call__( text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) - return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}) + return BatchFeature( + data={**text_inputs, **image_inputs, **videos_inputs}, + tensor_type=output_kwargs["common_kwargs"].get("return_tensors"), + ) # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama def batch_decode(self, *args, **kwargs): diff --git a/src/transformers/models/siglip/processing_siglip.py b/src/transformers/models/siglip/processing_siglip.py index eef71e33424d8e..cd7391f7bff395 100644 --- a/src/transformers/models/siglip/processing_siglip.py +++ b/src/transformers/models/siglip/processing_siglip.py @@ -67,6 +67,8 @@ def __call__( self, text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, images: Optional[ImageInput] = None, + audio=None, + videos=None, **kwargs: Unpack[SiglipProcessingKwargs], ) -> BatchFeature: """ @@ -116,9 +118,7 @@ def __call__( elif text is not None: return encoding else: - return BatchFeature( - data=dict(**image_features), tensor_type=output_kwargs["common_kwargs"]["return_tensors"] - ) + return image_features def decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/video_llava/processing_video_llava.py b/src/transformers/models/video_llava/processing_video_llava.py index 774c4003f3cb0f..61ebfbc40831de 100644 --- a/src/transformers/models/video_llava/processing_video_llava.py +++ b/src/transformers/models/video_llava/processing_video_llava.py @@ -20,7 +20,7 @@ from typing import List, Optional, Union from ...feature_extraction_utils import BatchFeature -from ...image_utils import ImageInput, get_image_size, to_numpy_array +from ...image_utils import ImageInput, VideoInput, get_image_size, to_numpy_array from ...processing_utils import ProcessingKwargs, ProcessorMixin from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import logging @@ -97,7 +97,7 @@ def __call__( self, text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, images: Optional[ImageInput] = None, - videos: Optional[ImageInput] = None, + videos: Optional[VideoInput] = None, audio=None, **kwargs: Unpack[VideoLlavaProcessorKwargs], ) -> BatchFeature: @@ -181,7 +181,7 @@ def __call__( text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) data.update(text_inputs) - return BatchFeature(data=data) + return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama def batch_decode(self, *args, **kwargs): diff --git a/src/transformers/models/vilt/processing_vilt.py b/src/transformers/models/vilt/processing_vilt.py index 46e18b3ff6bd88..91f466e1079e1a 100644 --- a/src/transformers/models/vilt/processing_vilt.py +++ b/src/transformers/models/vilt/processing_vilt.py @@ -20,9 +20,10 @@ import warnings 
from typing import List, Optional, Union +from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput from ...processing_utils import ProcessingKwargs, ProcessorMixin -from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput +from ...tokenization_utils_base import PreTokenizedInput, TextInput if sys.version_info >= (3, 11): @@ -87,13 +88,32 @@ def __call__( self, images: ImageInput, text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + audio=None, + videos=None, **kwargs: Unpack[ViltProcessorKwargs], - ) -> BatchEncoding: + ) -> BatchFeature: """ This method uses [`ViltImageProcessor.__call__`] method to prepare image(s) for the model, and [`BertTokenizerFast.__call__`] to prepare text for the model. - Please refer to the docstring of the above two methods for more information. + Args: + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ output_kwargs = self._merge_kwargs( ViltProcessorKwargs, From 39650f60c7b5d4808dd54aa6acbd38faee61bf0f Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Mon, 19 Aug 2024 16:57:04 +0800 Subject: [PATCH 11/25] improve docs --- .../models/altclip/processing_altclip.py | 6 ++++-- .../models/chameleon/processing_chameleon.py | 4 ++-- .../models/flava/processing_flava.py | 19 ++++++++++++++++++- .../processing_instructblipvideo.py | 3 ++- .../processing_llava_next_video.py | 6 +++--- .../models/siglip/processing_siglip.py | 4 ++-- .../video_llava/processing_video_llava.py | 6 +++--- .../models/vilt/processing_vilt.py | 2 +- 8 files changed, 35 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/altclip/processing_altclip.py b/src/transformers/models/altclip/processing_altclip.py index 63168c7dd205bc..c15b18d029c206 100644 --- a/src/transformers/models/altclip/processing_altclip.py +++ b/src/transformers/models/altclip/processing_altclip.py @@ -76,6 +76,8 @@ def __call__( self, text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, images: Optional[ImageInput] = None, + audio=None, + videos=None, **kwargs: Unpack[AltCLIPProcessingKwargs], ): """ @@ -86,11 +88,11 @@ def __call__( of the above two methods for more information. 
Args: - text (`str`, `List[str]`, `List[List[str]]`): + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`): The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + images (`ImageInput`, *optional*): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. Both channels-first and channels-last formats are supported. diff --git a/src/transformers/models/chameleon/processing_chameleon.py b/src/transformers/models/chameleon/processing_chameleon.py index d9fbe965034e59..14d759ec6dcf87 100644 --- a/src/transformers/models/chameleon/processing_chameleon.py +++ b/src/transformers/models/chameleon/processing_chameleon.py @@ -92,11 +92,11 @@ def __call__( of the above two methods for more information. Args: - text (`str`, `List[str]`, `List[List[str]]`): + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`, *optional*): The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + images (`ImageInput`, *optional*): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. Both channels-first and channels-last formats are supported. diff --git a/src/transformers/models/flava/processing_flava.py b/src/transformers/models/flava/processing_flava.py index b62fc77fc38746..c3a3fe12046660 100644 --- a/src/transformers/models/flava/processing_flava.py +++ b/src/transformers/models/flava/processing_flava.py @@ -100,7 +100,24 @@ def __call__( This method uses [`FlavaImageProcessor.__call__`] method to prepare image(s) for the model, and [`BertTokenizerFast.__call__`] to prepare text for the model. - Please refer to the docstring of the above two methods for more information. + Args: + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`ImageInput`, *optional*): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). 
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ if text is None and images is None: diff --git a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py index 58f2fc565a7fd6..1855dd4db4e58e 100644 --- a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py @@ -103,7 +103,7 @@ def __call__( The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + images (`ImageInput`, *optional*): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a number of channels, H and W are image height and width. @@ -119,6 +119,7 @@ def __call__( -- **qformer_attention_mask** - List of indices specifying which tokens from the Q-Former tokenizer should be attended to by the model. Returned when `text` is not `None`. - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ + output_kwargs = self._merge_kwargs( InstructBlipVideoProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, diff --git a/src/transformers/models/llava_next_video/processing_llava_next_video.py b/src/transformers/models/llava_next_video/processing_llava_next_video.py index d11512ae7ff8f8..ad85ff8d15e266 100644 --- a/src/transformers/models/llava_next_video/processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/processing_llava_next_video.py @@ -121,14 +121,14 @@ def __call__( of the above two methods for more information. Args: - text (`str`, `List[str]`, `List[List[str]]`): + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`, *optional*): The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + images (`ImageInput`, *optional*): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. Both channels-first and channels-last formats are supported. - videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`): + videos (`VideoInput`, *optional*): The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported. diff --git a/src/transformers/models/siglip/processing_siglip.py b/src/transformers/models/siglip/processing_siglip.py index cd7391f7bff395..265832d840daca 100644 --- a/src/transformers/models/siglip/processing_siglip.py +++ b/src/transformers/models/siglip/processing_siglip.py @@ -79,11 +79,11 @@ def __call__( of the above two methods for more information. 
Args: - text (`str`, `List[str]`, `List[List[str]]`): + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`, *optional*): The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + images (`ImageInput`, *optional*): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. Both channels-first and channels-last formats are supported. diff --git a/src/transformers/models/video_llava/processing_video_llava.py b/src/transformers/models/video_llava/processing_video_llava.py index 61ebfbc40831de..49c103fabe9729 100644 --- a/src/transformers/models/video_llava/processing_video_llava.py +++ b/src/transformers/models/video_llava/processing_video_llava.py @@ -109,15 +109,15 @@ def __call__( of the above two methods for more information. Args: - text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`): + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`, *optional*): The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + images (`ImageInput`, *optional*): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a number of channels, H and W are image height and width. - videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`): + videos (`VideoInput`, *optional*): Video frames to preprocess. Expects a single or batch of video frames in NumPy array or PyTorch tensor. Each video should be of shape (T, C, H, W), where T is number of frames, C is number of channels, H and W are image height and width. diff --git a/src/transformers/models/vilt/processing_vilt.py b/src/transformers/models/vilt/processing_vilt.py index 91f466e1079e1a..a282ebe2c9e3da 100644 --- a/src/transformers/models/vilt/processing_vilt.py +++ b/src/transformers/models/vilt/processing_vilt.py @@ -101,7 +101,7 @@ def __call__( The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + images (`ImageInput`, *optional*): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a number of channels, H and W are image height and width. 
From 539da9dde154645558c2daaafc7bf83f01fbd2c8 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Mon, 19 Aug 2024 17:05:53 +0800 Subject: [PATCH 12/25] don't dw from hub for video tests --- tests/test_processing_common.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index bad0eb8cd6b72b..ec77e1fed7fa70 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -26,7 +26,6 @@ import unittest import numpy as np -from huggingface_hub import hf_hub_download from transformers import CLIPTokenizerFast, ProcessorMixin from transformers.models.auto.processing_auto import processor_class_from_name @@ -94,10 +93,7 @@ def prepare_image_inputs(self): @require_vision def prepare_video_inputs(self): - video_file = hf_hub_download( - repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset" - ) - return [np.load(video_file)] + return [np.random.randint(255, size=(4, 3, 30, 400), dtype=np.uint8)] def test_processor_to_json_string(self): processor = self.get_processor() From c8b2384b00de01eaaf34a1f7d0afa68cae623ed1 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Mon, 19 Aug 2024 17:30:42 +0800 Subject: [PATCH 13/25] add video processing tests for instructblipvideo & video_llava --- .../processing_instructblipvideo.py | 27 ++++++++++++++----- .../test_processor_instructblipvideo.py | 1 + tests/test_processing_common.py | 25 +++++++++-------- 3 files changed, 36 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py index 1855dd4db4e58e..2a60f7ab3ef07a 100644 --- a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py @@ -18,6 +18,7 @@ import os import sys +import warnings from typing import List, Optional, Union from ...feature_extraction_utils import BatchFeature @@ -88,10 +89,10 @@ def __init__(self, image_processor, tokenizer, qformer_tokenizer=None, num_query def __call__( self, - images: Optional[VideoInput] = None, + images: Optional[VideoInput] = None, # Keeping this here for backwards compatibility text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + videos: Optional[VideoInput] = None, audio=None, - videos=None, **kwargs: Unpack[InstructBlipVideoProcessorKwargs], ) -> BatchFeature: """ @@ -107,6 +108,9 @@ def __call__( The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a number of channels, H and W are image height and width. + videos (`VideoInput`, *optional*): + The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch + tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported. Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: @@ -119,6 +123,17 @@ def __call__( -- **qformer_attention_mask** - List of indices specifying which tokens from the Q-Former tokenizer should be attended to by the model. Returned when `text` is not `None`. - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. 
""" + if images is not None: + warnings.warn( + "The `images` argument is deprecated and will be removed in future versions, use `videos` instead.", + FutureWarning, + ) + if images is not None and videos is not None: + raise ValueError( + "You cannot provide both `images` and `videos` at the same time. Please pass video data as `videos=...` instead." + ) + if images is not None and videos is None: + videos = images output_kwargs = self._merge_kwargs( InstructBlipVideoProcessorKwargs, @@ -144,7 +159,7 @@ def __call__( # if we know how many query tokens, expand text inside processor. We need this hacky manipulation # because BLIP expects image tokens to be at the beginning even before BOS token - if self.num_query_tokens is not None and images is not None: + if self.num_query_tokens is not None and videos is not None: text_encoding = {} video_tokens = ( self.video_token.content * self.num_query_tokens * 4 @@ -157,7 +172,7 @@ def __call__( ] else: text_encoding = _text_encoding - if images is not None: + if videos is not None: logger.warning_once( "Expanding inputs for video tokens in InstructBLIPVideo should be done in processing. " "Please follow instruction here (https://gist.github.com/zucchini-nlp/65f22892b054dc0d68228af56fbeaac2) to update your InstructBLIPVideo model. " @@ -173,8 +188,8 @@ def __call__( encoding["qformer_input_ids"] = qformer_text_encoding.pop("input_ids") encoding["qformer_attention_mask"] = qformer_text_encoding.pop("attention_mask") - if images is not None: - image_encoding = self.image_processor(images, **output_kwargs["images_kwargs"]) + if videos is not None: + image_encoding = self.image_processor(videos, **output_kwargs["images_kwargs"]) encoding.update(image_encoding) return encoding diff --git a/tests/models/instructblipvideo/test_processor_instructblipvideo.py b/tests/models/instructblipvideo/test_processor_instructblipvideo.py index 07c81f1f649651..9442a429944226 100644 --- a/tests/models/instructblipvideo/test_processor_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_processor_instructblipvideo.py @@ -9,6 +9,7 @@ class InstructBlipVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase): from_pretrained_id = "Salesforce/instructblip-vicuna-7b" processor_class = InstructBlipVideoProcessor + videos_data_arg_name = "pixel_values" def prepare_components(self): components = super().prepare_components() diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index ec77e1fed7fa70..ebb5e6f74f3d07 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -168,24 +168,27 @@ def test_image_processor_defaults_preserved_by_image_kwargs(self): @require_torch @require_vision def test_video_processor_defaults_preserved_by_kwargs(self): - if "video_processor" not in self.processor_class.attributes: + if "video_processor" not in self.processor_class.attributes and ( + "videos" not in inspect.signature(self.processor_class.__call__).parameters + or inspect.signature(self.processor_class.__call__).parameters["videos"].annotation == inspect._empty + ): self.skipTest(f"video_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", size=(234, 234), crop_size=(234, 234)) - video_processor = self.get_component("video_processor", size=(234, 234), crop_size=(234, 234)) - tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") - - processor = self.processor_class( - tokenizer=tokenizer, - image_processor=image_processor, - 
video_processor=video_processor, + processor_components = self.prepare_components() + processor_components["image_processor"] = self.get_component( + "image_processor", size=(234, 234), crop_size=(234, 234) ) + if "video_processor" in self.processor_class.attributes: + processor_components["video_processor"] = self.get_component( + "video_processor", size=(234, 234), crop_size=(234, 234) + ) + processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" - image_input = self.prepare_image_inputs() video_input = self.prepare_video_inputs() - inputs = processor(text=input_str, images=image_input, videos=video_input, return_tensors="pt") + inputs = processor(text=input_str, videos=video_input, return_tensors="pt") self.assertEqual(inputs[self.videos_data_arg_name].shape[-1], 234) @require_vision From 423d8645b1583102e986f76503fdbbb7d9a3b2fb Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Mon, 19 Aug 2024 21:08:50 +0800 Subject: [PATCH 14/25] add git, mgp, tvp, & x-clip --- .../models/altclip/processing_altclip.py | 17 ++-- .../models/flava/processing_flava.py | 13 +-- src/transformers/models/git/processing_git.py | 66 ++++++++------ .../models/mgp_str/processing_mgp_str.py | 87 +++++++++++++++---- .../models/siglip/processing_siglip.py | 16 ++-- .../models/tvp/image_processing_tvp.py | 16 ++++ src/transformers/models/tvp/processing_tvp.py | 74 ++++++++++------ .../models/vilt/processing_vilt.py | 3 +- .../models/x_clip/processing_x_clip.py | 55 ++++++++---- tests/models/git/test_processor_git.py | 6 +- .../models/mgp_str/test_processor_mgp_str.py | 22 ++--- tests/models/tvp/test_processor_tvp.py | 81 +++++++++++++++++ tests/models/x_clip/test_processor_x_clip.py | 54 ++++++++++++ 13 files changed, 387 insertions(+), 123 deletions(-) create mode 100644 tests/models/tvp/test_processor_tvp.py create mode 100644 tests/models/x_clip/test_processor_x_clip.py diff --git a/src/transformers/models/altclip/processing_altclip.py b/src/transformers/models/altclip/processing_altclip.py index c15b18d029c206..9d01f96afac5e3 100644 --- a/src/transformers/models/altclip/processing_altclip.py +++ b/src/transformers/models/altclip/processing_altclip.py @@ -20,6 +20,7 @@ import warnings from typing import List, Optional, Union +from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput from ...processing_utils import ProcessingKwargs, ProcessorMixin from ...tokenization_utils_base import PreTokenizedInput, TextInput @@ -31,7 +32,7 @@ from typing_extensions import Unpack -class AltCLIPProcessingKwargs(ProcessingKwargs, total=False): +class AltCLIPProcessorKwargs(ProcessingKwargs, total=False): _defaults = {} @@ -78,8 +79,8 @@ def __call__( images: Optional[ImageInput] = None, audio=None, videos=None, - **kwargs: Unpack[AltCLIPProcessingKwargs], - ): + **kwargs: Unpack[AltCLIPProcessorKwargs], + ) -> BatchFeature: """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` and `kwargs` arguments to XLMRobertaTokenizerFast's [`~XLMRobertaTokenizerFast.__call__`] if `text` is not @@ -110,7 +111,7 @@ def __call__( raise ValueError("You have to specify either text or images. 
Both cannot be none.") output_kwargs = self._merge_kwargs( - AltCLIPProcessingKwargs, + AltCLIPProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, ) @@ -121,13 +122,13 @@ def __call__( if images is not None: image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) + return_tensors = output_kwargs["common_kwargs"].get("return_tensors") if text is not None and images is not None: - encoding["pixel_values"] = image_features.pixel_values - return encoding + return BatchFeature(data=dict(**encoding, **image_features), tensor_type=return_tensors) elif text is not None: - return encoding + return BatchFeature(data=dict(**encoding), tensor_type=return_tensors) else: - return image_features + return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/flava/processing_flava.py b/src/transformers/models/flava/processing_flava.py index c3a3fe12046660..d1da077a185043 100644 --- a/src/transformers/models/flava/processing_flava.py +++ b/src/transformers/models/flava/processing_flava.py @@ -20,6 +20,7 @@ import warnings from typing import List, Optional, Union +from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin from ...tokenization_utils_base import PreTokenizedInput, TextInput @@ -95,13 +96,13 @@ def __call__( audio=None, videos=None, **kwargs: Unpack[FlavaProcessorKwargs], - ): + ) -> BatchFeature: """ This method uses [`FlavaImageProcessor.__call__`] method to prepare image(s) for the model, and [`BertTokenizerFast.__call__`] to prepare text for the model. Args: - text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`): + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`, *optional*): The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). 
@@ -134,13 +135,13 @@ def __call__( if images is not None: image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) + return_tensors = output_kwargs["common_kwargs"].get("return_tensors") if text is not None and images is not None: - encoding.update(image_features) - return encoding + return BatchFeature(data=dict(**encoding, **image_features), tensor_type=return_tensors) elif text is not None: - return encoding + return BatchFeature(data=dict(**encoding), tensor_type=return_tensors) else: - return image_features + return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/git/processing_git.py b/src/transformers/models/git/processing_git.py index 98649c644e728c..3d1166e3b06d97 100644 --- a/src/transformers/models/git/processing_git.py +++ b/src/transformers/models/git/processing_git.py @@ -16,8 +16,23 @@ Image/Text processor class for GIT """ -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import BatchEncoding +import sys +from typing import List, Optional, Union + +from ...feature_extraction_utils import BatchFeature +from ...image_utils import ImageInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin +from ...tokenization_utils_base import PreTokenizedInput, TextInput + + +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + + +class GitProcessorKwargs(ProcessingKwargs, total=False): + _defaults = {} class GitProcessor(ProcessorMixin): @@ -42,7 +57,14 @@ def __init__(self, image_processor, tokenizer): super().__init__(image_processor, tokenizer) self.current_processor = self.image_processor - def __call__(self, text=None, images=None, return_tensors=None, **kwargs): + def __call__( + self, + text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + images: Optional[ImageInput] = None, + audio=None, + videos=None, + **kwargs: Unpack[GitProcessorKwargs], + ) -> BatchFeature: """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode @@ -51,24 +73,16 @@ def __call__(self, text=None, images=None, return_tensors=None, **kwargs): of the above two methods for more information. Args: - text (`str`, `List[str]`, `List[List[str]]`): + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`, *optional*): The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + images (`ImageInput`, *optional*): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. Both channels-first and channels-last formats are supported. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - - `'tf'`: Return TensorFlow `tf.constant` objects. - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. 
- - `'jax'`: Return JAX `jnp.ndarray` objects. - Returns: - [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when @@ -76,29 +90,29 @@ def __call__(self, text=None, images=None, return_tensors=None, **kwargs): `None`). - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ - tokenizer_kwargs, image_processor_kwargs = {}, {} - if kwargs: - tokenizer_kwargs = {k: v for k, v in kwargs.items() if k not in self.image_processor._valid_processor_keys} - image_processor_kwargs = { - k: v for k, v in kwargs.items() if k in self.image_processor._valid_processor_keys - } + + output_kwargs = self._merge_kwargs( + GitProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) if text is None and images is None: raise ValueError("You have to specify either text or images. Both cannot be none.") if text is not None: - encoding = self.tokenizer(text, return_tensors=return_tensors, **tokenizer_kwargs) + encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) if images is not None: - image_features = self.image_processor(images, return_tensors=return_tensors, **image_processor_kwargs) + image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) + return_tensors = output_kwargs["common_kwargs"].get("return_tensors") if text is not None and images is not None: - encoding["pixel_values"] = image_features.pixel_values - return encoding + return BatchFeature(data=dict(**encoding, **image_features), tensor_type=return_tensors) elif text is not None: - return encoding + return BatchFeature(data=dict(**encoding), tensor_type=return_tensors) else: - return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) + return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/mgp_str/processing_mgp_str.py b/src/transformers/models/mgp_str/processing_mgp_str.py index 207d4230ba09b7..7e30a0336b809f 100644 --- a/src/transformers/models/mgp_str/processing_mgp_str.py +++ b/src/transformers/models/mgp_str/processing_mgp_str.py @@ -14,13 +14,24 @@ # limitations under the License. 
"""Processor class for MGP-STR.""" +import sys import warnings +from typing import List, Optional, Union from transformers import AutoTokenizer -from transformers.utils import is_torch_available -from transformers.utils.generic import ExplicitEnum -from ...processing_utils import ProcessorMixin +from ...feature_extraction_utils import BatchFeature +from ...image_utils import ImageInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin +from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils.generic import ExplicitEnum +from ...utils.import_utils import is_torch_available + + +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack if is_torch_available(): @@ -36,6 +47,10 @@ class DecodeType(ExplicitEnum): SUPPORTED_ANNOTATION_FORMATS = (DecodeType.CHARACTER, DecodeType.BPE, DecodeType.WORDPIECE) +class MgpstrProcessorKwargs(ProcessingKwargs, total=False): + _defaults = {} + + class MgpstrProcessor(ProcessorMixin): r""" Constructs a MGP-STR processor which wraps an image processor and MGP-STR tokenizers into a single @@ -50,9 +65,9 @@ class MgpstrProcessor(ProcessorMixin): The tokenizer is a required input. """ - attributes = ["image_processor", "char_tokenizer"] + attributes = ["image_processor", "tokenizer"] image_processor_class = "ViTImageProcessor" - char_tokenizer_class = "MgpstrTokenizer" + tokenizer_class = "MgpstrTokenizer" def __init__(self, image_processor=None, tokenizer=None, **kwargs): feature_extractor = None @@ -63,41 +78,81 @@ def __init__(self, image_processor=None, tokenizer=None, **kwargs): FutureWarning, ) feature_extractor = kwargs.pop("feature_extractor") + if "char_tokenizer" in kwargs: + warnings.warn( + "The `char_tokenizer` argument is deprecated and will be removed in future versions, use `tokenizer`" + " instead.", + FutureWarning, + ) + char_tokenizer = kwargs.pop("char_tokenizer") image_processor = image_processor if image_processor is not None else feature_extractor + tokenizer = tokenizer if tokenizer is not None else char_tokenizer if image_processor is None: raise ValueError("You need to specify an `image_processor`.") if tokenizer is None: raise ValueError("You need to specify a `tokenizer`.") - self.char_tokenizer = tokenizer + self.tokenizer = tokenizer + self.char_tokenizer = tokenizer # For backwards compatibility self.bpe_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2") self.wp_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") super().__init__(image_processor, tokenizer) - def __call__(self, text=None, images=None, return_tensors=None, **kwargs): + def __call__( + self, + text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + images: Optional[ImageInput] = None, + audio=None, + videos=None, + **kwargs: Unpack[MgpstrProcessorKwargs], + ) -> BatchFeature: """ When used in normal mode, this method forwards all its arguments to ViTImageProcessor's [`~ViTImageProcessor.__call__`] and returns its output. This method also forwards the `text` and `kwargs` arguments to MgpstrTokenizer's [`~MgpstrTokenizer.__call__`] if `text` is not `None` to encode the text. Please refer to the doctsring of the above methods for more information. + + Args: + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`, *optional*): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). 
If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`ImageInput`, *optional*): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **labels** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ if images is None and text is None: raise ValueError("You need to specify either an `images` or `text` input to process.") + output_kwargs = self._merge_kwargs( + MgpstrProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + if images is not None: - inputs = self.image_processor(images, return_tensors=return_tensors, **kwargs) + image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) if text is not None: - encodings = self.char_tokenizer(text, return_tensors=return_tensors, **kwargs) + encodings = self.tokenizer(text, **output_kwargs["text_kwargs"]) - if text is None: - return inputs - elif images is None: - return encodings + return_tensors = output_kwargs["common_kwargs"].get("return_tensors") + if text is not None and images is not None: + return BatchFeature(data=dict(**image_features, labels=encodings["input_ids"]), tensor_type=return_tensors) + elif text is not None: + return BatchFeature(data=dict(**encodings), tensor_type=return_tensors) else: - inputs["labels"] = encodings["input_ids"] - return inputs + return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) def batch_decode(self, sequences): """ @@ -201,7 +256,7 @@ def char_decode(self, sequences): Returns: `List[str]`: The list of char decoded sentences. """ - decode_strs = [seq.replace(" ", "") for seq in self.char_tokenizer.batch_decode(sequences)] + decode_strs = [seq.replace(" ", "") for seq in self.tokenizer.batch_decode(sequences)] return decode_strs def bpe_decode(self, sequences): diff --git a/src/transformers/models/siglip/processing_siglip.py b/src/transformers/models/siglip/processing_siglip.py index 265832d840daca..ac3f659d3efaf9 100644 --- a/src/transformers/models/siglip/processing_siglip.py +++ b/src/transformers/models/siglip/processing_siglip.py @@ -31,7 +31,7 @@ from typing_extensions import Unpack -class SiglipProcessingKwargs(ProcessingKwargs, total=False): +class SiglipProcessorKwargs(ProcessingKwargs, total=False): _defaults = { "text_kwargs": { "padding": False, @@ -69,7 +69,7 @@ def __call__( images: Optional[ImageInput] = None, audio=None, videos=None, - **kwargs: Unpack[SiglipProcessingKwargs], + **kwargs: Unpack[SiglipProcessorKwargs], ) -> BatchFeature: """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` @@ -101,7 +101,7 @@ def __call__( raise ValueError("You have to specify either text or images. 
Both cannot be none.") output_kwargs = self._merge_kwargs( - SiglipProcessingKwargs, + SiglipProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, ) @@ -112,13 +112,15 @@ def __call__( if images is not None: image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) + return_tensors = output_kwargs["common_kwargs"].get("return_tensors") if text is not None and images is not None: - encoding["pixel_values"] = image_features.pixel_values - return encoding + return BatchFeature( + data=dict(**encoding, pixel_values=image_features.pixel_values), tensor_type=return_tensors + ) elif text is not None: - return encoding + return BatchFeature(data=dict(**encoding), tensor_type=return_tensors) else: - return image_features + return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) def decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/tvp/image_processing_tvp.py b/src/transformers/models/tvp/image_processing_tvp.py index 100ec133e8b026..4e9618eef17084 100644 --- a/src/transformers/models/tvp/image_processing_tvp.py +++ b/src/transformers/models/tvp/image_processing_tvp.py @@ -305,20 +305,30 @@ def _preprocess_image( # All transformations expect numpy arrays. image = to_numpy_array(image) + print(f"{image.shape = }") + if do_resize: image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + print(f"{image.shape = }") + if do_center_crop: image = self.center_crop(image, size=crop_size, input_data_format=input_data_format) + print(f"{image.shape = }") + if do_rescale: image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + print(f"{image.shape = }") + if do_normalize: image = self.normalize( image=image.astype(np.float32), mean=image_mean, std=image_std, input_data_format=input_data_format ) + print(f"{image.shape = }") + if do_pad: image = self.pad_image( image=image, @@ -328,12 +338,18 @@ def _preprocess_image( input_data_format=input_data_format, ) + print(f"{image.shape = }") + # the pretrained checkpoints assume images are BGR, not RGB if do_flip_channel_order: image = flip_channel_order(image=image, input_data_format=input_data_format) + print(f"{image.shape = }") + image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + print(f"{image.shape = }") + return image @filter_out_non_signature_kwargs() diff --git a/src/transformers/models/tvp/processing_tvp.py b/src/transformers/models/tvp/processing_tvp.py index eb8aabfdade3ed..96a85a984d8f84 100644 --- a/src/transformers/models/tvp/processing_tvp.py +++ b/src/transformers/models/tvp/processing_tvp.py @@ -16,8 +16,35 @@ Processor class for TVP. 
""" -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import BatchEncoding +import sys +from typing import List, Optional, Union + +from ...feature_extraction_utils import BatchFeature +from ...image_utils import VideoInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin, TextKwargs +from ...tokenization_utils_base import PreTokenizedInput, TextInput + + +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + + +class TvpTextKwargs(TextKwargs, total=False): + pad_to_max_length: bool + + +class TvpProcessorKwargs(ProcessingKwargs, total=False): + text_kwargs: TvpTextKwargs + _defaults = { + "text_kwargs": { + "padding": "max_length", + "truncation": True, + "pad_to_max_length": True, + "return_token_type_ids": False, + }, + } class TvpProcessor(ProcessorMixin): @@ -46,7 +73,14 @@ def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) - def __call__(self, text=None, videos=None, return_tensors=None, **kwargs): + def __call__( + self, + text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + videos: Optional[VideoInput] = None, + images=None, + audio=None, + **kwargs: Unpack[TvpProcessorKwargs], + ) -> BatchFeature: """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode @@ -65,16 +99,8 @@ def __call__(self, text=None, videos=None, return_tensors=None, **kwargs): each frame should be of shape (H, W, C), where H and W are frame height and width, and C is a number of channels. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - - `'tf'`: Return TensorFlow `tf.constant` objects. - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - - `'jax'`: Return JAX `jnp.ndarray` objects. - Returns: - [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when @@ -83,30 +109,28 @@ def __call__(self, text=None, videos=None, return_tensors=None, **kwargs): - **pixel_values** -- Pixel values to be fed to a model. Returned when `videos` is not `None`. """ - max_text_length = kwargs.pop("max_text_length", None) + if "max_text_length" in kwargs: + kwargs["max_length"] = kwargs.pop("max_text_length") if text is None and videos is None: raise ValueError("You have to specify either text or videos. 
Both cannot be none.") + output_kwargs = self._merge_kwargs( + TvpProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + encoding = {} if text is not None: - textual_input = self.tokenizer.batch_encode_plus( - text, - truncation=True, - padding="max_length", - max_length=max_text_length, - pad_to_max_length=True, - return_tensors=return_tensors, - return_token_type_ids=False, - **kwargs, - ) + textual_input = self.tokenizer(text, **output_kwargs["text_kwargs"]) encoding.update(textual_input) if videos is not None: - image_features = self.image_processor(videos, return_tensors=return_tensors, **kwargs) + image_features = self.image_processor(videos, **output_kwargs["videos_kwargs"]) encoding.update(image_features) - return BatchEncoding(data=encoding, tensor_type=return_tensors) + return BatchFeature(data=encoding, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/vilt/processing_vilt.py b/src/transformers/models/vilt/processing_vilt.py index a282ebe2c9e3da..be8729e297a875 100644 --- a/src/transformers/models/vilt/processing_vilt.py +++ b/src/transformers/models/vilt/processing_vilt.py @@ -114,6 +114,7 @@ def __call__( `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not `None`). - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + - **pixel_mask** -- Mask for the pixel values. Returned when `images` is not `None`. """ output_kwargs = self._merge_kwargs( ViltProcessorKwargs, @@ -126,7 +127,7 @@ def __call__( encoding_image_processor = self.image_processor(images, **output_kwargs["images_kwargs"]) encoding.update(encoding_image_processor) - return encoding + return BatchFeature(data=dict(**encoding), tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/x_clip/processing_x_clip.py b/src/transformers/models/x_clip/processing_x_clip.py index a11aeb18dc4f59..fa2cc860fd5d05 100644 --- a/src/transformers/models/x_clip/processing_x_clip.py +++ b/src/transformers/models/x_clip/processing_x_clip.py @@ -16,10 +16,24 @@ Image/Text processor class for XCLIP """ +import sys import warnings +from typing import List, Optional, Union -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import BatchEncoding +from ...feature_extraction_utils import BatchFeature +from ...image_utils import VideoInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin +from ...tokenization_utils_base import PreTokenizedInput, TextInput + + +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + + +class XCLIPProcessorKwargs(ProcessingKwargs, total=False): + _defaults = {} class XCLIPProcessor(ProcessorMixin): @@ -59,7 +73,14 @@ def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) self.current_processor = self.image_processor - def __call__(self, text=None, videos=None, return_tensors=None, **kwargs): + def __call__( + self, + text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + videos: Optional[VideoInput] = None, + images=None, + audio=None, + **kwargs: Unpack[XCLIPProcessorKwargs], + ) -> BatchFeature: """ Main method to prepare for the model one or several sequences(s) and image(s). 
This method forwards the `text` and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode @@ -78,16 +99,8 @@ def __call__(self, text=None, videos=None, return_tensors=None, **kwargs): each frame should be of shape (H, W, C), where H and W are frame height and width, and C is a number of channels. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - - `'tf'`: Return TensorFlow `tf.constant` objects. - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - - `'jax'`: Return JAX `jnp.ndarray` objects. - Returns: - [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when @@ -99,19 +112,25 @@ def __call__(self, text=None, videos=None, return_tensors=None, **kwargs): if text is None and videos is None: raise ValueError("You have to specify either text or videos. Both cannot be none.") + output_kwargs = self._merge_kwargs( + XCLIPProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + if text is not None: - encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs) + encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) if videos is not None: - image_features = self.image_processor(videos, return_tensors=return_tensors, **kwargs) + image_features = self.image_processor(videos, **output_kwargs["videos_kwargs"]) + return_tensors = output_kwargs["common_kwargs"].get("return_tensors") if text is not None and videos is not None: - encoding["pixel_values"] = image_features.pixel_values - return encoding + return BatchFeature(data=dict(**encoding, **image_features), tensor_type=return_tensors) elif text is not None: - return encoding + return BatchFeature(data=dict(**encoding), tensor_type=return_tensors) else: - return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) + return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) def batch_decode(self, *args, **kwargs): """ diff --git a/tests/models/git/test_processor_git.py b/tests/models/git/test_processor_git.py index 95e436d8e4f526..d66260bc57483a 100644 --- a/tests/models/git/test_processor_git.py +++ b/tests/models/git/test_processor_git.py @@ -21,6 +21,8 @@ from transformers.testing_utils import require_vision from transformers.utils import is_vision_available +from ...test_processing_common import ProcessorTesterMixin + if is_vision_available(): from PIL import Image @@ -29,7 +31,9 @@ @require_vision -class GitProcessorTest(unittest.TestCase): +class GitProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = GitProcessor + def setUp(self): self.tmpdirname = tempfile.mkdtemp() diff --git a/tests/models/mgp_str/test_processor_mgp_str.py b/tests/models/mgp_str/test_processor_mgp_str.py index 6a028a28424d61..a5322aa5d31435 100644 --- a/tests/models/mgp_str/test_processor_mgp_str.py +++ b/tests/models/mgp_str/test_processor_mgp_str.py @@ -20,29 +20,30 @@ import tempfile import unittest -import numpy as np import pytest -from transformers import MgpstrTokenizer +from transformers import MgpstrProcessor, MgpstrTokenizer from transformers.models.mgp_str.tokenization_mgp_str import 
VOCAB_FILES_NAMES from transformers.testing_utils import require_torch, require_vision from transformers.utils import IMAGE_PROCESSOR_NAME, is_torch_available, is_vision_available +from ...test_processing_common import ProcessorTesterMixin + if is_torch_available(): import torch if is_vision_available(): - from PIL import Image - - from transformers import MgpstrProcessor, ViTImageProcessor + from transformers import ViTImageProcessor @require_torch @require_vision -class MgpstrProcessorTest(unittest.TestCase): +class MgpstrProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = MgpstrProcessor image_processing_class = ViTImageProcessor if is_vision_available() else None + text_data_arg_name = "labels" @property def image_processor_dict(self): @@ -79,15 +80,6 @@ def get_image_processor(self, **kwargs): def tearDown(self): shutil.rmtree(self.tmpdirname) - def prepare_image_inputs(self): - """This function prepares a list of PIL images.""" - - image_input = np.random.randint(255, size=(3, 30, 400), dtype=np.uint8) - - image_input = Image.fromarray(np.moveaxis(image_input, 0, -1)) - - return image_input - def test_save_load_pretrained_default(self): tokenizer = self.get_tokenizer() image_processor = self.get_image_processor() diff --git a/tests/models/tvp/test_processor_tvp.py b/tests/models/tvp/test_processor_tvp.py new file mode 100644 index 00000000000000..8f5e0bd6b5d05d --- /dev/null +++ b/tests/models/tvp/test_processor_tvp.py @@ -0,0 +1,81 @@ +import inspect +import tempfile +import unittest + +import numpy as np + +from transformers import TvpProcessor +from transformers.testing_utils import require_torch, require_vision + +from ...test_processing_common import ProcessorTesterMixin + + +class TvpProcessorTest(ProcessorTesterMixin, unittest.TestCase): + from_pretrained_id = "Jiqing/tiny-random-tvp" + processor_class = TvpProcessor + videos_data_arg_name = "pixel_values" + + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + processor = self.processor_class.from_pretrained(self.from_pretrained_id) + processor.save_pretrained(self.tmpdirname) + + @require_vision + def prepare_video_inputs(self): + return [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] + + @require_torch + @require_vision + def test_video_processor_defaults_preserved_by_kwargs(self): + if "video_processor" not in self.processor_class.attributes and ( + "videos" not in inspect.signature(self.processor_class.__call__).parameters + or inspect.signature(self.processor_class.__call__).parameters["videos"].annotation == inspect._empty + ): + self.skipTest(f"video_processor attribute not present in {self.processor_class}") + processor_components = self.prepare_components() + processor_components["image_processor"] = self.get_component( + "image_processor", size=(234, 234), crop_size=(234, 234), do_pad=False + ) + if "video_processor" in self.processor_class.attributes: + processor_components["video_processor"] = self.get_component( + "video_processor", size=(234, 234), crop_size=(234, 234), do_pad=False + ) + processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") + processor = self.processor_class(**processor_components) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + video_input = self.prepare_video_inputs() + + inputs = processor(text=input_str, videos=video_input, return_tensors="pt") + self.assertEqual(inputs[self.videos_data_arg_name].shape[-1], 234) + + @require_torch + @require_vision + def 
test_image_processor_defaults_preserved_by_image_kwargs(self): + self.skipTest("TVP does not process images") + + @require_torch + @require_vision + def test_kwargs_overrides_default_image_processor_kwargs(self): + self.skipTest("TVP does not process images") + + @require_torch + @require_vision + def test_unstructured_kwargs(self): + self.skipTest("TVP does not process images") + + @require_torch + @require_vision + def test_unstructured_kwargs_batched(self): + self.skipTest("TVP does not process images") + + @require_torch + @require_vision + def test_structured_kwargs_nested(self): + self.skipTest("TVP does not process images") + + @require_torch + @require_vision + def test_structured_kwargs_nested_from_dict(self): + self.skipTest("TVP does not process images") diff --git a/tests/models/x_clip/test_processor_x_clip.py b/tests/models/x_clip/test_processor_x_clip.py new file mode 100644 index 00000000000000..e9d0bf4b2539ee --- /dev/null +++ b/tests/models/x_clip/test_processor_x_clip.py @@ -0,0 +1,54 @@ +import tempfile +import unittest + +import numpy as np + +from transformers import XCLIPProcessor +from transformers.testing_utils import require_torch, require_vision + +from ...test_processing_common import ProcessorTesterMixin + + +class XCLIPProcessorTest(ProcessorTesterMixin, unittest.TestCase): + from_pretrained_id = "microsoft/xclip-base-patch32" + processor_class = XCLIPProcessor + videos_data_arg_name = "pixel_values" + + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + processor = self.processor_class.from_pretrained(self.from_pretrained_id) + processor.save_pretrained(self.tmpdirname) + + @require_vision + def prepare_video_inputs(self): + return [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] + + @require_torch + @require_vision + def test_image_processor_defaults_preserved_by_image_kwargs(self): + self.skipTest("XCLIP does not process images") + + @require_torch + @require_vision + def test_kwargs_overrides_default_image_processor_kwargs(self): + self.skipTest("XCLIP does not process images") + + @require_torch + @require_vision + def test_unstructured_kwargs(self): + self.skipTest("XCLIP does not process images") + + @require_torch + @require_vision + def test_unstructured_kwargs_batched(self): + self.skipTest("XCLIP does not process images") + + @require_torch + @require_vision + def test_structured_kwargs_nested(self): + self.skipTest("XCLIP does not process images") + + @require_torch + @require_vision + def test_structured_kwargs_nested_from_dict(self): + self.skipTest("XCLIP does not process images") From 5fd2c32673d2faf69f70e0412fbbd4e0996335c8 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Mon, 19 Aug 2024 21:20:19 +0800 Subject: [PATCH 15/25] fix docs --- src/transformers/models/flava/processing_flava.py | 8 ++++---- .../instructblipvideo/processing_instructblipvideo.py | 9 +++++---- src/transformers/models/vilt/processing_vilt.py | 10 +++++----- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/flava/processing_flava.py b/src/transformers/models/flava/processing_flava.py index d1da077a185043..f40e76698032c6 100644 --- a/src/transformers/models/flava/processing_flava.py +++ b/src/transformers/models/flava/processing_flava.py @@ -102,14 +102,14 @@ def __call__( [`BertTokenizerFast.__call__`] to prepare text for the model. Args: - text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`, *optional*): - The sequence or batch of sequences to be encoded. 
Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). images (`ImageInput`, *optional*): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a number of channels, H and W are image height and width. + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`, *optional*): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: diff --git a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py index 2a60f7ab3ef07a..6e6ee8eb865aba 100644 --- a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py @@ -100,14 +100,15 @@ def __call__( [`BertTokenizerFast.__call__`] to prepare text for the model. Args: - text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). images (`ImageInput`, *optional*): + NOTE: Use `videos` instead. We only left this here for backwards compatibility. The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a number of channels, H and W are image height and width. + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`, *optional*): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). videos (`VideoInput`, *optional*): The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported. diff --git a/src/transformers/models/vilt/processing_vilt.py b/src/transformers/models/vilt/processing_vilt.py index be8729e297a875..c5d22502096d31 100644 --- a/src/transformers/models/vilt/processing_vilt.py +++ b/src/transformers/models/vilt/processing_vilt.py @@ -97,14 +97,14 @@ def __call__( [`BertTokenizerFast.__call__`] to prepare text for the model. Args: - text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). 
If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - images (`ImageInput`, *optional*): + images (`ImageInput`): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a number of channels, H and W are image height and width. + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`, *optional*): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: From 9e00f6875e8b90c45434b4a073c6cf78b0a3ffac Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Tue, 20 Aug 2024 18:21:17 +0800 Subject: [PATCH 16/25] address @zucchini-nlp's comments --- .../models/mgp_str/processing_mgp_str.py | 25 ++++++++++++------- .../models/tvp/image_processing_tvp.py | 24 ++++++------------ .../videomae/image_processing_videomae.py | 9 ++++++- .../models/vivit/image_processing_vivit.py | 8 +++++- tests/models/tvp/test_processor_tvp.py | 6 ----- tests/models/x_clip/test_processor_x_clip.py | 6 ----- tests/test_processing_common.py | 2 +- 7 files changed, 39 insertions(+), 41 deletions(-) diff --git a/src/transformers/models/mgp_str/processing_mgp_str.py b/src/transformers/models/mgp_str/processing_mgp_str.py index 7e30a0336b809f..169d8adcec7b8a 100644 --- a/src/transformers/models/mgp_str/processing_mgp_str.py +++ b/src/transformers/models/mgp_str/processing_mgp_str.py @@ -78,28 +78,35 @@ def __init__(self, image_processor=None, tokenizer=None, **kwargs): FutureWarning, ) feature_extractor = kwargs.pop("feature_extractor") - if "char_tokenizer" in kwargs: - warnings.warn( - "The `char_tokenizer` argument is deprecated and will be removed in future versions, use `tokenizer`" - " instead.", - FutureWarning, - ) - char_tokenizer = kwargs.pop("char_tokenizer") image_processor = image_processor if image_processor is not None else feature_extractor - tokenizer = tokenizer if tokenizer is not None else char_tokenizer if image_processor is None: raise ValueError("You need to specify an `image_processor`.") if tokenizer is None: raise ValueError("You need to specify a `tokenizer`.") self.tokenizer = tokenizer - self.char_tokenizer = tokenizer # For backwards compatibility self.bpe_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2") self.wp_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") super().__init__(image_processor, tokenizer) + @property + def char_tokenizer(self): + warnings.warn( + "The `char_tokenizer` attribute is deprecated and will be removed in future versions, use `tokenizer` instead.", + FutureWarning, + ) + return self.tokenizer + + @char_tokenizer.setter + def char_tokenizer(self, value): + warnings.warn( + "The `char_tokenizer` attribute is deprecated and will be removed in future versions, use `tokenizer` instead.", + FutureWarning, + ) + self.tokenizer = value + def __call__( self, text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, diff --git a/src/transformers/models/tvp/image_processing_tvp.py 
b/src/transformers/models/tvp/image_processing_tvp.py index 4e9618eef17084..7a4c5db004671e 100644 --- a/src/transformers/models/tvp/image_processing_tvp.py +++ b/src/transformers/models/tvp/image_processing_tvp.py @@ -50,7 +50,13 @@ # Copied from transformers.models.vivit.image_processing_vivit.make_batched def make_batched(videos) -> List[List[ImageInput]]: - if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): + if isinstance(videos, np.ndarray) and videos.ndim == 5: + return videos + + elif isinstance(videos, np.ndarray) and videos.ndim == 4: + return [videos] + + elif isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): @@ -305,30 +311,20 @@ def _preprocess_image( # All transformations expect numpy arrays. image = to_numpy_array(image) - print(f"{image.shape = }") - if do_resize: image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) - print(f"{image.shape = }") - if do_center_crop: image = self.center_crop(image, size=crop_size, input_data_format=input_data_format) - print(f"{image.shape = }") - if do_rescale: image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) - print(f"{image.shape = }") - if do_normalize: image = self.normalize( image=image.astype(np.float32), mean=image_mean, std=image_std, input_data_format=input_data_format ) - print(f"{image.shape = }") - if do_pad: image = self.pad_image( image=image, @@ -338,18 +334,12 @@ def _preprocess_image( input_data_format=input_data_format, ) - print(f"{image.shape = }") - # the pretrained checkpoints assume images are BGR, not RGB if do_flip_channel_order: image = flip_channel_order(image=image, input_data_format=input_data_format) - print(f"{image.shape = }") - image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) - print(f"{image.shape = }") - return image @filter_out_non_signature_kwargs() diff --git a/src/transformers/models/videomae/image_processing_videomae.py b/src/transformers/models/videomae/image_processing_videomae.py index 413589523aa675..c21210faf6670c 100644 --- a/src/transformers/models/videomae/image_processing_videomae.py +++ b/src/transformers/models/videomae/image_processing_videomae.py @@ -47,8 +47,15 @@ logger = logging.get_logger(__name__) +# Copied from transformers.models.vivit.image_processing_vivit.make_batched def make_batched(videos) -> List[List[ImageInput]]: - if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): + if isinstance(videos, np.ndarray) and videos.ndim == 5: + return videos + + elif isinstance(videos, np.ndarray) and videos.ndim == 4: + return [videos] + + elif isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): diff --git a/src/transformers/models/vivit/image_processing_vivit.py b/src/transformers/models/vivit/image_processing_vivit.py index 5f251bbd1b95b9..fb959e9f1eddb2 100644 --- a/src/transformers/models/vivit/image_processing_vivit.py +++ b/src/transformers/models/vivit/image_processing_vivit.py @@ -51,7 +51,13 @@ def make_batched(videos) -> List[List[ImageInput]]: - if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and 
is_valid_image(videos[0][0]): + if isinstance(videos, np.ndarray) and videos.ndim == 5: + return videos + + elif isinstance(videos, np.ndarray) and videos.ndim == 4: + return [videos] + + elif isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): diff --git a/tests/models/tvp/test_processor_tvp.py b/tests/models/tvp/test_processor_tvp.py index 8f5e0bd6b5d05d..40d700e0beea15 100644 --- a/tests/models/tvp/test_processor_tvp.py +++ b/tests/models/tvp/test_processor_tvp.py @@ -2,8 +2,6 @@ import tempfile import unittest -import numpy as np - from transformers import TvpProcessor from transformers.testing_utils import require_torch, require_vision @@ -20,10 +18,6 @@ def setUp(self): processor = self.processor_class.from_pretrained(self.from_pretrained_id) processor.save_pretrained(self.tmpdirname) - @require_vision - def prepare_video_inputs(self): - return [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] - @require_torch @require_vision def test_video_processor_defaults_preserved_by_kwargs(self): diff --git a/tests/models/x_clip/test_processor_x_clip.py b/tests/models/x_clip/test_processor_x_clip.py index e9d0bf4b2539ee..5b34855a67252a 100644 --- a/tests/models/x_clip/test_processor_x_clip.py +++ b/tests/models/x_clip/test_processor_x_clip.py @@ -1,8 +1,6 @@ import tempfile import unittest -import numpy as np - from transformers import XCLIPProcessor from transformers.testing_utils import require_torch, require_vision @@ -19,10 +17,6 @@ def setUp(self): processor = self.processor_class.from_pretrained(self.from_pretrained_id) processor.save_pretrained(self.tmpdirname) - @require_vision - def prepare_video_inputs(self): - return [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] - @require_torch @require_vision def test_image_processor_defaults_preserved_by_image_kwargs(self): diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index ebb5e6f74f3d07..53cfcf5520c053 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -93,7 +93,7 @@ def prepare_image_inputs(self): @require_vision def prepare_video_inputs(self): - return [np.random.randint(255, size=(4, 3, 30, 400), dtype=np.uint8)] + return np.random.randint(255, size=(1, 4, 3, 30, 400), dtype=np.uint8) def test_processor_to_json_string(self): processor = self.get_processor() From a2672a6dc10f7b5d2ebed3e4733b0116763f2a62 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Tue, 20 Aug 2024 18:32:25 +0800 Subject: [PATCH 17/25] simplify implementations --- .../models/altclip/processing_altclip.py | 15 +++++-------- .../models/flava/processing_flava.py | 14 +++++-------- src/transformers/models/git/processing_git.py | 21 +++++++------------ .../models/mgp_str/processing_mgp_str.py | 17 +++++++-------- .../models/siglip/processing_siglip.py | 17 +++++---------- src/transformers/models/tvp/processing_tvp.py | 14 ++++++------- .../models/vilt/processing_vilt.py | 14 +++++++------ .../models/x_clip/processing_x_clip.py | 17 ++++++--------- 8 files changed, 50 insertions(+), 79 deletions(-) diff --git a/src/transformers/models/altclip/processing_altclip.py b/src/transformers/models/altclip/processing_altclip.py index 9d01f96afac5e3..51ea3032053c3d 100644 --- a/src/transformers/models/altclip/processing_altclip.py +++ b/src/transformers/models/altclip/processing_altclip.py @@ -116,19 +116,14 @@ def __call__( **kwargs, ) + data = {} if 
text is not None: - encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) - + text_features = self.tokenizer(text, **output_kwargs["text_kwargs"]) + data.update(text_features) if images is not None: image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) - - return_tensors = output_kwargs["common_kwargs"].get("return_tensors") - if text is not None and images is not None: - return BatchFeature(data=dict(**encoding, **image_features), tensor_type=return_tensors) - elif text is not None: - return BatchFeature(data=dict(**encoding), tensor_type=return_tensors) - else: - return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) + data.update(image_features) + return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/flava/processing_flava.py b/src/transformers/models/flava/processing_flava.py index f40e76698032c6..ace0434d0bd2f7 100644 --- a/src/transformers/models/flava/processing_flava.py +++ b/src/transformers/models/flava/processing_flava.py @@ -130,18 +130,14 @@ def __call__( **kwargs, ) + data = {} if text is not None: - encoding = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) + text_features = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) + data.update(text_features) if images is not None: image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) - - return_tensors = output_kwargs["common_kwargs"].get("return_tensors") - if text is not None and images is not None: - return BatchFeature(data=dict(**encoding, **image_features), tensor_type=return_tensors) - elif text is not None: - return BatchFeature(data=dict(**encoding), tensor_type=return_tensors) - else: - return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) + data.update(image_features) + return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/git/processing_git.py b/src/transformers/models/git/processing_git.py index 3d1166e3b06d97..5abb1990233ac9 100644 --- a/src/transformers/models/git/processing_git.py +++ b/src/transformers/models/git/processing_git.py @@ -91,28 +91,23 @@ def __call__( - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ + if text is None and images is None: + raise ValueError("You have to specify either text or images. Both cannot be none.") + output_kwargs = self._merge_kwargs( GitProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, ) - if text is None and images is None: - raise ValueError("You have to specify either text or images. 
Both cannot be none.") - + data = {} if text is not None: - encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) - + text_features = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) + data.update(text_features) if images is not None: image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) - - return_tensors = output_kwargs["common_kwargs"].get("return_tensors") - if text is not None and images is not None: - return BatchFeature(data=dict(**encoding, **image_features), tensor_type=return_tensors) - elif text is not None: - return BatchFeature(data=dict(**encoding), tensor_type=return_tensors) - else: - return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) + data.update(image_features) + return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/mgp_str/processing_mgp_str.py b/src/transformers/models/mgp_str/processing_mgp_str.py index 169d8adcec7b8a..11ff8653779301 100644 --- a/src/transformers/models/mgp_str/processing_mgp_str.py +++ b/src/transformers/models/mgp_str/processing_mgp_str.py @@ -139,6 +139,7 @@ def __call__( `None`). - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ + if images is None and text is None: raise ValueError("You need to specify either an `images` or `text` input to process.") @@ -148,18 +149,14 @@ def __call__( **kwargs, ) + data = {} + if text is not None: + text_features = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) + data.update(text_features) if images is not None: image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) - if text is not None: - encodings = self.tokenizer(text, **output_kwargs["text_kwargs"]) - - return_tensors = output_kwargs["common_kwargs"].get("return_tensors") - if text is not None and images is not None: - return BatchFeature(data=dict(**image_features, labels=encodings["input_ids"]), tensor_type=return_tensors) - elif text is not None: - return BatchFeature(data=dict(**encodings), tensor_type=return_tensors) - else: - return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) + data.update(image_features) + return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) def batch_decode(self, sequences): """ diff --git a/src/transformers/models/siglip/processing_siglip.py b/src/transformers/models/siglip/processing_siglip.py index ac3f659d3efaf9..f8f3e8f9eaff49 100644 --- a/src/transformers/models/siglip/processing_siglip.py +++ b/src/transformers/models/siglip/processing_siglip.py @@ -106,21 +106,14 @@ def __call__( **kwargs, ) + data = {} if text is not None: - encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) - + text_features = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) + data.update(text_features) if images is not None: image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) - - return_tensors = output_kwargs["common_kwargs"].get("return_tensors") - if text is not None and images is not None: - return BatchFeature( - data=dict(**encoding, pixel_values=image_features.pixel_values), tensor_type=return_tensors - ) - elif text is not None: - return BatchFeature(data=dict(**encoding), tensor_type=return_tensors) - else: - return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) + data.update(image_features) + return BatchFeature(data=data, 
tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) def decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/tvp/processing_tvp.py b/src/transformers/models/tvp/processing_tvp.py index 96a85a984d8f84..7ce29d9e9e1a53 100644 --- a/src/transformers/models/tvp/processing_tvp.py +++ b/src/transformers/models/tvp/processing_tvp.py @@ -121,16 +121,14 @@ def __call__( **kwargs, ) - encoding = {} + data = {} if text is not None: - textual_input = self.tokenizer(text, **output_kwargs["text_kwargs"]) - encoding.update(textual_input) - + text_features = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) + data.update(text_features) if videos is not None: - image_features = self.image_processor(videos, **output_kwargs["videos_kwargs"]) - encoding.update(image_features) - - return BatchFeature(data=encoding, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) + video_features = self.image_processor(videos, **output_kwargs["videos_kwargs"]) + data.update(video_features) + return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/vilt/processing_vilt.py b/src/transformers/models/vilt/processing_vilt.py index c5d22502096d31..562e5a3f94a955 100644 --- a/src/transformers/models/vilt/processing_vilt.py +++ b/src/transformers/models/vilt/processing_vilt.py @@ -122,12 +122,14 @@ def __call__( **kwargs, ) - encoding = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) - # add pixel_values + pixel_mask - encoding_image_processor = self.image_processor(images, **output_kwargs["images_kwargs"]) - encoding.update(encoding_image_processor) - - return BatchFeature(data=dict(**encoding), tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) + data = {} + if text is not None: + text_features = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) + data.update(text_features) + if images is not None: + images_features = self.image_processor(images, **output_kwargs["images_kwargs"]) + data.update(images_features) + return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/x_clip/processing_x_clip.py b/src/transformers/models/x_clip/processing_x_clip.py index fa2cc860fd5d05..f722ef37d498a8 100644 --- a/src/transformers/models/x_clip/processing_x_clip.py +++ b/src/transformers/models/x_clip/processing_x_clip.py @@ -118,19 +118,14 @@ def __call__( **kwargs, ) + data = {} if text is not None: - encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) - + text_features = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) + data.update(text_features) if videos is not None: - image_features = self.image_processor(videos, **output_kwargs["videos_kwargs"]) - - return_tensors = output_kwargs["common_kwargs"].get("return_tensors") - if text is not None and videos is not None: - return BatchFeature(data=dict(**encoding, **image_features), tensor_type=return_tensors) - elif text is not None: - return BatchFeature(data=dict(**encoding), tensor_type=return_tensors) - else: - return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) + video_features = self.image_processor(videos, **output_kwargs["images_kwargs"]) + data.update(video_features) + return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) def batch_decode(self, *args, **kwargs): """ From 
721d1c81944564d1c30b83bd286ea72721a29523 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Tue, 20 Aug 2024 18:52:01 +0800 Subject: [PATCH 18/25] uniformize implementations of make_batched_videos and make_batched_images --- .../chameleon/image_processing_chameleon.py | 3 ++- .../image_processing_instructblipvideo.py | 21 ++++++++++++------- .../llava_next/image_processing_llava_next.py | 2 +- .../image_processing_llava_next_video.py | 21 ++++++++++++------- .../models/tvp/image_processing_tvp.py | 14 ++++++++----- .../image_processing_video_llava.py | 21 ++++++++++++------- .../videomae/image_processing_videomae.py | 14 ++++++++----- .../models/vivit/image_processing_vivit.py | 12 +++++++---- 8 files changed, 71 insertions(+), 37 deletions(-) diff --git a/src/transformers/models/chameleon/image_processing_chameleon.py b/src/transformers/models/chameleon/image_processing_chameleon.py index a23fdbed028867..2b0bd0024f3be1 100644 --- a/src/transformers/models/chameleon/image_processing_chameleon.py +++ b/src/transformers/models/chameleon/image_processing_chameleon.py @@ -44,7 +44,8 @@ import PIL -def make_batched_images(images) -> List[List[ImageInput]]: +# Copied from transformers.models.llava_next.image_processing_llava_next.make_batched_images +def make_batched_images(images) -> List[ImageInput]: """ Accepts images in list or nested list format, and makes a list of images for preprocessing. diff --git a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py index 131b8fe57bd665..cf9074fe1bbecd 100644 --- a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py @@ -47,18 +47,25 @@ logger = logging.get_logger(__name__) +# Copied from transformers.models.vivit.image_processing_vivit.make_batched_videos def make_batched_videos(videos) -> List[VideoInput]: - if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): + if isinstance(videos, np.ndarray) and videos.ndim == 5: return videos + elif isinstance(videos, np.ndarray) and videos.ndim == 4: + return [videos] + + elif isinstance(videos, (list, tuple)): + if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): + return videos + if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: + return videos + elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - if isinstance(videos[0], PIL.Image.Image): - return [videos] - elif len(videos[0].shape) == 4: - return [list(video) for video in videos] + return [videos] - elif is_valid_image(videos) and len(videos.shape) == 4: - return [list(videos)] + elif is_valid_image(videos): + return [[videos]] raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/llava_next/image_processing_llava_next.py b/src/transformers/models/llava_next/image_processing_llava_next.py index f8237d0078bf0a..c5a0eaa63739c2 100644 --- a/src/transformers/models/llava_next/image_processing_llava_next.py +++ b/src/transformers/models/llava_next/image_processing_llava_next.py @@ -53,7 +53,7 @@ from PIL import Image -def make_batched_images(images) -> List[List[ImageInput]]: +def make_batched_images(images) -> List[ImageInput]: """ Accepts images in list or nested list format, and makes a list of images for preprocessing. 
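The make_batched_videos copies introduced across these image processors all implement the same normalization. As a standalone reference, a minimal sketch of that logic is given below; the _sketch suffix and the flat-list-of-frames fallback are assumptions for illustration, so this is not the library helper itself.

from typing import List

import numpy as np


def make_batched_videos_sketch(videos) -> List:
    # 5D array: already a batch of videos.
    if isinstance(videos, np.ndarray) and videos.ndim == 5:
        return list(videos)
    # 4D array: a single video, wrap it into a one-element batch.
    if isinstance(videos, np.ndarray) and videos.ndim == 4:
        return [videos]
    if isinstance(videos, (list, tuple)) and len(videos) > 0:
        # A list of frame lists, or a list of 4D arrays: already batched.
        if isinstance(videos[0], (list, tuple)) or (
            isinstance(videos[0], np.ndarray) and videos[0].ndim == 4
        ):
            return list(videos)
        # A flat list of frames (e.g. 3D arrays): treat it as one video.
        return [list(videos)]
    raise ValueError(f"Could not make batched video from {videos}")


# A single 8-frame RGB clip given as one 4D array becomes a batch of one video.
clip = np.random.randint(0, 255, size=(8, 3, 224, 224), dtype=np.uint8)
assert len(make_batched_videos_sketch(clip)) == 1
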
diff --git a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py index e16e71875bb2c8..9991d3eb6d1afd 100644 --- a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py @@ -49,18 +49,25 @@ from PIL import Image +# Copied from transformers.models.vivit.image_processing_vivit.make_batched_videos def make_batched_videos(videos) -> List[VideoInput]: - if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): + if isinstance(videos, np.ndarray) and videos.ndim == 5: return videos + elif isinstance(videos, np.ndarray) and videos.ndim == 4: + return [videos] + + elif isinstance(videos, (list, tuple)): + if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): + return videos + if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: + return videos + elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - if isinstance(videos[0], Image.Image): - return [videos] - elif len(videos[0].shape) == 4: - return [list(video) for video in videos] + return [videos] - elif is_valid_image(videos) and len(videos.shape) == 4: - return [list(videos)] + elif is_valid_image(videos): + return [[videos]] raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/tvp/image_processing_tvp.py b/src/transformers/models/tvp/image_processing_tvp.py index 7a4c5db004671e..07a657e1afb229 100644 --- a/src/transformers/models/tvp/image_processing_tvp.py +++ b/src/transformers/models/tvp/image_processing_tvp.py @@ -32,6 +32,7 @@ ChannelDimension, ImageInput, PILImageResampling, + VideoInput, get_image_size, is_valid_image, to_numpy_array, @@ -48,16 +49,19 @@ logger = logging.get_logger(__name__) -# Copied from transformers.models.vivit.image_processing_vivit.make_batched -def make_batched(videos) -> List[List[ImageInput]]: +# Copied from transformers.models.vivit.image_processing_vivit.make_batched_videos +def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos, np.ndarray) and videos.ndim == 5: return videos elif isinstance(videos, np.ndarray) and videos.ndim == 4: return [videos] - elif isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos + elif isinstance(videos, (list, tuple)): + if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): + return videos + if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: + return videos elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): return [videos] @@ -449,7 +453,7 @@ def preprocess( "torch.Tensor, tf.Tensor or jax.ndarray." 
) - videos = make_batched(videos) + videos = make_batched_videos(videos) videos = [ np.array( diff --git a/src/transformers/models/video_llava/image_processing_video_llava.py b/src/transformers/models/video_llava/image_processing_video_llava.py index 3e77110c7d45a8..412c0499bc36e5 100644 --- a/src/transformers/models/video_llava/image_processing_video_llava.py +++ b/src/transformers/models/video_llava/image_processing_video_llava.py @@ -50,18 +50,25 @@ import PIL +# Copied from transformers.models.vivit.image_processing_vivit.make_batched_videos def make_batched_videos(videos) -> List[VideoInput]: - if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): + if isinstance(videos, np.ndarray) and videos.ndim == 5: return videos + elif isinstance(videos, np.ndarray) and videos.ndim == 4: + return [videos] + + elif isinstance(videos, (list, tuple)): + if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): + return videos + if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: + return videos + elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - if isinstance(videos[0], PIL.Image.Image): - return [videos] - elif len(videos[0].shape) == 4: - return [list(video) for video in videos] + return [videos] - elif is_valid_image(videos) and len(videos.shape) == 4: - return [list(videos)] + elif is_valid_image(videos): + return [[videos]] raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/videomae/image_processing_videomae.py b/src/transformers/models/videomae/image_processing_videomae.py index c21210faf6670c..3914fd867dfb65 100644 --- a/src/transformers/models/videomae/image_processing_videomae.py +++ b/src/transformers/models/videomae/image_processing_videomae.py @@ -30,6 +30,7 @@ ChannelDimension, ImageInput, PILImageResampling, + VideoInput, infer_channel_dimension_format, is_scaled_image, is_valid_image, @@ -47,16 +48,19 @@ logger = logging.get_logger(__name__) -# Copied from transformers.models.vivit.image_processing_vivit.make_batched -def make_batched(videos) -> List[List[ImageInput]]: +# Copied from transformers.models.vivit.image_processing_vivit.make_batched_videos +def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos, np.ndarray) and videos.ndim == 5: return videos elif isinstance(videos, np.ndarray) and videos.ndim == 4: return [videos] - elif isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos + elif isinstance(videos, (list, tuple)): + if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): + return videos + if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: + return videos elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): return [videos] @@ -324,7 +328,7 @@ def preprocess( "torch.Tensor, tf.Tensor or jax.ndarray." 
) - videos = make_batched(videos) + videos = make_batched_videos(videos) videos = [ [ diff --git a/src/transformers/models/vivit/image_processing_vivit.py b/src/transformers/models/vivit/image_processing_vivit.py index fb959e9f1eddb2..171805e52229c7 100644 --- a/src/transformers/models/vivit/image_processing_vivit.py +++ b/src/transformers/models/vivit/image_processing_vivit.py @@ -34,6 +34,7 @@ ChannelDimension, ImageInput, PILImageResampling, + VideoInput, infer_channel_dimension_format, is_scaled_image, is_valid_image, @@ -50,15 +51,18 @@ logger = logging.get_logger(__name__) -def make_batched(videos) -> List[List[ImageInput]]: +def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos, np.ndarray) and videos.ndim == 5: return videos elif isinstance(videos, np.ndarray) and videos.ndim == 4: return [videos] - elif isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos + elif isinstance(videos, (list, tuple)): + if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): + return videos + if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: + return videos elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): return [videos] @@ -381,7 +385,7 @@ def preprocess( "torch.Tensor, tf.Tensor or jax.ndarray." ) - videos = make_batched(videos) + videos = make_batched_videos(videos) videos = [ [ From c0f3abb60760ee2934f5bfe33b6a05ab7e96290b Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Tue, 20 Aug 2024 19:07:26 +0800 Subject: [PATCH 19/25] fix instructblipvideo tests --- .../instructblipvideo/image_processing_instructblipvideo.py | 5 ++--- .../llava_next_video/image_processing_llava_next_video.py | 5 ++--- src/transformers/models/tvp/image_processing_tvp.py | 5 ++--- .../models/video_llava/image_processing_video_llava.py | 5 ++--- .../models/videomae/image_processing_videomae.py | 5 ++--- 5 files changed, 10 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py index cf9074fe1bbecd..827d1bfc0bbb8d 100644 --- a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py @@ -60,9 +60,8 @@ def make_batched_videos(videos) -> List[VideoInput]: return videos if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: return videos - - elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - return [videos] + if is_valid_image(videos[0]): + return [videos] elif is_valid_image(videos): return [[videos]] diff --git a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py index 9991d3eb6d1afd..3a53f222f5f226 100644 --- a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py @@ -62,9 +62,8 @@ def make_batched_videos(videos) -> List[VideoInput]: return videos if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: return videos - - elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - return [videos] + if is_valid_image(videos[0]): + return [videos] elif is_valid_image(videos): return [[videos]] diff --git a/src/transformers/models/tvp/image_processing_tvp.py 
b/src/transformers/models/tvp/image_processing_tvp.py index 07a657e1afb229..96bdf9855f6666 100644 --- a/src/transformers/models/tvp/image_processing_tvp.py +++ b/src/transformers/models/tvp/image_processing_tvp.py @@ -62,9 +62,8 @@ def make_batched_videos(videos) -> List[VideoInput]: return videos if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: return videos - - elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - return [videos] + if is_valid_image(videos[0]): + return [videos] elif is_valid_image(videos): return [[videos]] diff --git a/src/transformers/models/video_llava/image_processing_video_llava.py b/src/transformers/models/video_llava/image_processing_video_llava.py index 412c0499bc36e5..04d24be50eb38c 100644 --- a/src/transformers/models/video_llava/image_processing_video_llava.py +++ b/src/transformers/models/video_llava/image_processing_video_llava.py @@ -63,9 +63,8 @@ def make_batched_videos(videos) -> List[VideoInput]: return videos if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: return videos - - elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - return [videos] + if is_valid_image(videos[0]): + return [videos] elif is_valid_image(videos): return [[videos]] diff --git a/src/transformers/models/videomae/image_processing_videomae.py b/src/transformers/models/videomae/image_processing_videomae.py index 3914fd867dfb65..12c86bdfd6e7f7 100644 --- a/src/transformers/models/videomae/image_processing_videomae.py +++ b/src/transformers/models/videomae/image_processing_videomae.py @@ -61,9 +61,8 @@ def make_batched_videos(videos) -> List[VideoInput]: return videos if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: return videos - - elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - return [videos] + if is_valid_image(videos[0]): + return [videos] elif is_valid_image(videos): return [[videos]] From bb5debdf49e555143d3fc1ca55db8288f45fc5c2 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Tue, 20 Aug 2024 19:12:22 +0800 Subject: [PATCH 20/25] fix copies --- .../image_processing_instructblipvideo.py | 4 ++-- .../llava_next_video/image_processing_llava_next_video.py | 4 ++-- src/transformers/models/tvp/image_processing_tvp.py | 4 ++-- .../models/video_llava/image_processing_video_llava.py | 4 ++-- .../models/videomae/image_processing_videomae.py | 4 ++-- src/transformers/models/vivit/image_processing_vivit.py | 7 +++---- 6 files changed, 13 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py index 827d1bfc0bbb8d..506e2c8ef9521f 100644 --- a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py @@ -58,9 +58,9 @@ def make_batched_videos(videos) -> List[VideoInput]: elif isinstance(videos, (list, tuple)): if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos - if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: + elif isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: return videos - if is_valid_image(videos[0]): + elif is_valid_image(videos[0]): return [videos] elif is_valid_image(videos): diff --git a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py index 
3a53f222f5f226..1efd9d6e6af3e3 100644 --- a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py @@ -60,9 +60,9 @@ def make_batched_videos(videos) -> List[VideoInput]: elif isinstance(videos, (list, tuple)): if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos - if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: + elif isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: return videos - if is_valid_image(videos[0]): + elif is_valid_image(videos[0]): return [videos] elif is_valid_image(videos): diff --git a/src/transformers/models/tvp/image_processing_tvp.py b/src/transformers/models/tvp/image_processing_tvp.py index 96bdf9855f6666..175b0df07f68fd 100644 --- a/src/transformers/models/tvp/image_processing_tvp.py +++ b/src/transformers/models/tvp/image_processing_tvp.py @@ -60,9 +60,9 @@ def make_batched_videos(videos) -> List[VideoInput]: elif isinstance(videos, (list, tuple)): if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos - if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: + elif isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: return videos - if is_valid_image(videos[0]): + elif is_valid_image(videos[0]): return [videos] elif is_valid_image(videos): diff --git a/src/transformers/models/video_llava/image_processing_video_llava.py b/src/transformers/models/video_llava/image_processing_video_llava.py index 04d24be50eb38c..87a54f79744a41 100644 --- a/src/transformers/models/video_llava/image_processing_video_llava.py +++ b/src/transformers/models/video_llava/image_processing_video_llava.py @@ -61,9 +61,9 @@ def make_batched_videos(videos) -> List[VideoInput]: elif isinstance(videos, (list, tuple)): if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos - if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: + elif isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: return videos - if is_valid_image(videos[0]): + elif is_valid_image(videos[0]): return [videos] elif is_valid_image(videos): diff --git a/src/transformers/models/videomae/image_processing_videomae.py b/src/transformers/models/videomae/image_processing_videomae.py index 12c86bdfd6e7f7..0de895b51fbdea 100644 --- a/src/transformers/models/videomae/image_processing_videomae.py +++ b/src/transformers/models/videomae/image_processing_videomae.py @@ -59,9 +59,9 @@ def make_batched_videos(videos) -> List[VideoInput]: elif isinstance(videos, (list, tuple)): if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos - if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: + elif isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: return videos - if is_valid_image(videos[0]): + elif is_valid_image(videos[0]): return [videos] elif is_valid_image(videos): diff --git a/src/transformers/models/vivit/image_processing_vivit.py b/src/transformers/models/vivit/image_processing_vivit.py index 171805e52229c7..06e70662f270ce 100644 --- a/src/transformers/models/vivit/image_processing_vivit.py +++ b/src/transformers/models/vivit/image_processing_vivit.py @@ -61,11 +61,10 @@ def make_batched_videos(videos) -> List[VideoInput]: elif isinstance(videos, (list, tuple)): if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos - if isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: + elif isinstance(videos[0], 
np.ndarray) and videos[0].ndim == 4: return videos - - elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - return [videos] + elif is_valid_image(videos[0]): + return [videos] elif is_valid_image(videos): return [[videos]] From d9bc2e924eb16045a3cb3960c877f0a65000ca6c Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Tue, 20 Aug 2024 20:01:34 +0800 Subject: [PATCH 21/25] fix make_batched_videos --- .../image_processing_instructblipvideo.py | 19 +++++++----------- .../image_processing_llava_next_video.py | 19 +++++++----------- .../models/tvp/image_processing_tvp.py | 19 +++++++----------- .../image_processing_video_llava.py | 20 +++++++------------ .../videomae/image_processing_videomae.py | 19 +++++++----------- .../models/vivit/image_processing_vivit.py | 19 +++++++----------- 6 files changed, 42 insertions(+), 73 deletions(-) diff --git a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py index 506e2c8ef9521f..e8a7a75120160f 100644 --- a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py @@ -49,22 +49,17 @@ # Copied from transformers.models.vivit.image_processing_vivit.make_batched_videos def make_batched_videos(videos) -> List[VideoInput]: - if isinstance(videos, np.ndarray) and videos.ndim == 5: + if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos - elif isinstance(videos, np.ndarray) and videos.ndim == 4: - return [videos] - - elif isinstance(videos, (list, tuple)): - if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos - elif isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: - return videos - elif is_valid_image(videos[0]): + elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): + if isinstance(videos[0], PIL.Image.Image): return [videos] + elif len(videos[0].shape) == 4: + return [list(video) for video in videos] - elif is_valid_image(videos): - return [[videos]] + elif is_valid_image(videos) and len(videos.shape) == 4: + return [list(videos)] raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py index 1efd9d6e6af3e3..3196974855550a 100644 --- a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py @@ -51,22 +51,17 @@ # Copied from transformers.models.vivit.image_processing_vivit.make_batched_videos def make_batched_videos(videos) -> List[VideoInput]: - if isinstance(videos, np.ndarray) and videos.ndim == 5: + if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos - elif isinstance(videos, np.ndarray) and videos.ndim == 4: - return [videos] - - elif isinstance(videos, (list, tuple)): - if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos - elif isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: - return videos - elif is_valid_image(videos[0]): + elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): + if isinstance(videos[0], PIL.Image.Image): return [videos] + elif len(videos[0].shape) 
== 4: + return [list(video) for video in videos] - elif is_valid_image(videos): - return [[videos]] + elif is_valid_image(videos) and len(videos.shape) == 4: + return [list(videos)] raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/tvp/image_processing_tvp.py b/src/transformers/models/tvp/image_processing_tvp.py index 175b0df07f68fd..8b1a737e998ae7 100644 --- a/src/transformers/models/tvp/image_processing_tvp.py +++ b/src/transformers/models/tvp/image_processing_tvp.py @@ -51,22 +51,17 @@ # Copied from transformers.models.vivit.image_processing_vivit.make_batched_videos def make_batched_videos(videos) -> List[VideoInput]: - if isinstance(videos, np.ndarray) and videos.ndim == 5: + if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos - elif isinstance(videos, np.ndarray) and videos.ndim == 4: - return [videos] - - elif isinstance(videos, (list, tuple)): - if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos - elif isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: - return videos - elif is_valid_image(videos[0]): + elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): + if isinstance(videos[0], PIL.Image.Image): return [videos] + elif len(videos[0].shape) == 4: + return [list(video) for video in videos] - elif is_valid_image(videos): - return [[videos]] + elif is_valid_image(videos) and len(videos.shape) == 4: + return [list(videos)] raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/video_llava/image_processing_video_llava.py b/src/transformers/models/video_llava/image_processing_video_llava.py index 87a54f79744a41..85dc131c6a5db2 100644 --- a/src/transformers/models/video_llava/image_processing_video_llava.py +++ b/src/transformers/models/video_llava/image_processing_video_llava.py @@ -34,7 +34,6 @@ VideoInput, infer_channel_dimension_format, is_scaled_image, - is_valid_image, make_list_of_images, to_numpy_array, valid_images, @@ -52,22 +51,17 @@ # Copied from transformers.models.vivit.image_processing_vivit.make_batched_videos def make_batched_videos(videos) -> List[VideoInput]: - if isinstance(videos, np.ndarray) and videos.ndim == 5: + if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos - elif isinstance(videos, np.ndarray) and videos.ndim == 4: - return [videos] - - elif isinstance(videos, (list, tuple)): - if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos - elif isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: - return videos - elif is_valid_image(videos[0]): + elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): + if isinstance(videos[0], PIL.Image.Image): return [videos] + elif len(videos[0].shape) == 4: + return [list(video) for video in videos] - elif is_valid_image(videos): - return [[videos]] + elif is_valid_image(videos) and len(videos.shape) == 4: + return [list(videos)] raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/videomae/image_processing_videomae.py b/src/transformers/models/videomae/image_processing_videomae.py index 0de895b51fbdea..8df92b3516fc0f 100644 --- a/src/transformers/models/videomae/image_processing_videomae.py +++ b/src/transformers/models/videomae/image_processing_videomae.py @@ -50,22 +50,17 @@ # Copied from 
transformers.models.vivit.image_processing_vivit.make_batched_videos def make_batched_videos(videos) -> List[VideoInput]: - if isinstance(videos, np.ndarray) and videos.ndim == 5: + if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos - elif isinstance(videos, np.ndarray) and videos.ndim == 4: - return [videos] - - elif isinstance(videos, (list, tuple)): - if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos - elif isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: - return videos - elif is_valid_image(videos[0]): + elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): + if isinstance(videos[0], PIL.Image.Image): return [videos] + elif len(videos[0].shape) == 4: + return [list(video) for video in videos] - elif is_valid_image(videos): - return [[videos]] + elif is_valid_image(videos) and len(videos.shape) == 4: + return [list(videos)] raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/vivit/image_processing_vivit.py b/src/transformers/models/vivit/image_processing_vivit.py index 06e70662f270ce..0cca6305f2a1cc 100644 --- a/src/transformers/models/vivit/image_processing_vivit.py +++ b/src/transformers/models/vivit/image_processing_vivit.py @@ -52,22 +52,17 @@ def make_batched_videos(videos) -> List[VideoInput]: - if isinstance(videos, np.ndarray) and videos.ndim == 5: + if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos - elif isinstance(videos, np.ndarray) and videos.ndim == 4: - return [videos] - - elif isinstance(videos, (list, tuple)): - if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos - elif isinstance(videos[0], np.ndarray) and videos[0].ndim == 4: - return videos - elif is_valid_image(videos[0]): + elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): + if isinstance(videos[0], PIL.Image.Image): return [videos] + elif len(videos[0].shape) == 4: + return [list(video) for video in videos] - elif is_valid_image(videos): - return [[videos]] + elif is_valid_image(videos) and len(videos.shape) == 4: + return [list(videos)] raise ValueError(f"Could not make batched video from {videos}") From f6e7914aa091befe0e44490f00a4ace1bb899370 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Tue, 20 Aug 2024 20:07:17 +0800 Subject: [PATCH 22/25] fix MGP-str --- .../llava_next_video/image_processing_llava_next_video.py | 4 ++-- src/transformers/models/mgp_str/processing_mgp_str.py | 4 ++++ .../models/video_llava/image_processing_video_llava.py | 1 + 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py index 3196974855550a..705c6adc42a536 100644 --- a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py @@ -46,7 +46,7 @@ if is_vision_available(): - from PIL import Image + import PIL # Copied from transformers.models.vivit.image_processing_vivit.make_batched_videos @@ -213,7 +213,7 @@ def _preprocess( do_convert_rgb: bool = None, data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, - ) -> Image.Image: + ) -> PIL.Image.Image: """ Preprocess 
an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`. diff --git a/src/transformers/models/mgp_str/processing_mgp_str.py b/src/transformers/models/mgp_str/processing_mgp_str.py index 11ff8653779301..d194017669c460 100644 --- a/src/transformers/models/mgp_str/processing_mgp_str.py +++ b/src/transformers/models/mgp_str/processing_mgp_str.py @@ -156,6 +156,10 @@ def __call__( if images is not None: image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) data.update(image_features) + # TODO: remove this after standardizing the outputs of vision-language processors + if "input_ids" in data: + data["labels"] = data["input_ids"] + data.pop("input_ids") return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) def batch_decode(self, sequences): diff --git a/src/transformers/models/video_llava/image_processing_video_llava.py b/src/transformers/models/video_llava/image_processing_video_llava.py index 85dc131c6a5db2..befc4c017260eb 100644 --- a/src/transformers/models/video_llava/image_processing_video_llava.py +++ b/src/transformers/models/video_llava/image_processing_video_llava.py @@ -34,6 +34,7 @@ VideoInput, infer_channel_dimension_format, is_scaled_image, + is_valid_image, make_list_of_images, to_numpy_array, valid_images, From acd2c562ec270b6c0e50a5f48b3637ebd5d403cd Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Tue, 20 Aug 2024 23:27:33 +0800 Subject: [PATCH 23/25] fix make_batched_videos --- .../image_processing_instructblipvideo.py | 9 ++++++--- .../image_processing_llava_next_video.py | 9 ++++++--- src/transformers/models/tvp/image_processing_tvp.py | 9 ++++++--- .../models/video_llava/image_processing_video_llava.py | 9 ++++++--- .../models/videomae/image_processing_videomae.py | 9 ++++++--- src/transformers/models/vivit/image_processing_vivit.py | 9 ++++++--- 6 files changed, 36 insertions(+), 18 deletions(-) diff --git a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py index e8a7a75120160f..cd163b370ebd20 100644 --- a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py @@ -56,10 +56,13 @@ def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos[0], PIL.Image.Image): return [videos] elif len(videos[0].shape) == 4: - return [list(video) for video in videos] + return [videos] - elif is_valid_image(videos) and len(videos.shape) == 4: - return [list(videos)] + elif is_valid_image(videos): + if len(videos.shape) == 5: + return videos + elif len(videos.shape) == 4: + return [videos] raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py index 705c6adc42a536..4c96486e413435 100644 --- a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py @@ -58,10 +58,13 @@ def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos[0], PIL.Image.Image): return [videos] elif len(videos[0].shape) == 4: - return [list(video) for video in videos] + return [videos] - elif is_valid_image(videos) and len(videos.shape) == 4: - return [list(videos)] + elif 
is_valid_image(videos): + if len(videos.shape) == 5: + return videos + elif len(videos.shape) == 4: + return [videos] raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/tvp/image_processing_tvp.py b/src/transformers/models/tvp/image_processing_tvp.py index 8b1a737e998ae7..d7f80cbaed1c2a 100644 --- a/src/transformers/models/tvp/image_processing_tvp.py +++ b/src/transformers/models/tvp/image_processing_tvp.py @@ -58,10 +58,13 @@ def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos[0], PIL.Image.Image): return [videos] elif len(videos[0].shape) == 4: - return [list(video) for video in videos] + return [videos] - elif is_valid_image(videos) and len(videos.shape) == 4: - return [list(videos)] + elif is_valid_image(videos): + if len(videos.shape) == 5: + return videos + elif len(videos.shape) == 4: + return [videos] raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/video_llava/image_processing_video_llava.py b/src/transformers/models/video_llava/image_processing_video_llava.py index befc4c017260eb..a96704b48503e6 100644 --- a/src/transformers/models/video_llava/image_processing_video_llava.py +++ b/src/transformers/models/video_llava/image_processing_video_llava.py @@ -59,10 +59,13 @@ def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos[0], PIL.Image.Image): return [videos] elif len(videos[0].shape) == 4: - return [list(video) for video in videos] + return [videos] - elif is_valid_image(videos) and len(videos.shape) == 4: - return [list(videos)] + elif is_valid_image(videos): + if len(videos.shape) == 5: + return videos + elif len(videos.shape) == 4: + return [videos] raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/videomae/image_processing_videomae.py b/src/transformers/models/videomae/image_processing_videomae.py index 8df92b3516fc0f..be0ed4baf88ec2 100644 --- a/src/transformers/models/videomae/image_processing_videomae.py +++ b/src/transformers/models/videomae/image_processing_videomae.py @@ -57,10 +57,13 @@ def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos[0], PIL.Image.Image): return [videos] elif len(videos[0].shape) == 4: - return [list(video) for video in videos] + return [videos] - elif is_valid_image(videos) and len(videos.shape) == 4: - return [list(videos)] + elif is_valid_image(videos): + if len(videos.shape) == 5: + return videos + elif len(videos.shape) == 4: + return [videos] raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/vivit/image_processing_vivit.py b/src/transformers/models/vivit/image_processing_vivit.py index 0cca6305f2a1cc..99587a6a27753b 100644 --- a/src/transformers/models/vivit/image_processing_vivit.py +++ b/src/transformers/models/vivit/image_processing_vivit.py @@ -59,10 +59,13 @@ def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos[0], PIL.Image.Image): return [videos] elif len(videos[0].shape) == 4: - return [list(video) for video in videos] + return [videos] - elif is_valid_image(videos) and len(videos.shape) == 4: - return [list(videos)] + elif is_valid_image(videos): + if len(videos.shape) == 5: + return videos + elif len(videos.shape) == 4: + return [videos] raise ValueError(f"Could not make batched video from {videos}") From 5c39f4f19a091c9cd648fd1376e9b374a8fdad95 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Tue, 20 Aug 2024 23:44:17 +0800 Subject: [PATCH 
24/25] fix make_batched_videos --- .../instructblipvideo/image_processing_instructblipvideo.py | 2 ++ .../llava_next_video/image_processing_llava_next_video.py | 2 ++ src/transformers/models/tvp/image_processing_tvp.py | 2 ++ .../models/video_llava/image_processing_video_llava.py | 2 ++ src/transformers/models/videomae/image_processing_videomae.py | 2 ++ src/transformers/models/vivit/image_processing_vivit.py | 2 ++ 6 files changed, 12 insertions(+) diff --git a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py index cd163b370ebd20..a686806a97451b 100644 --- a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py @@ -56,6 +56,8 @@ def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos[0], PIL.Image.Image): return [videos] elif len(videos[0].shape) == 4: + return videos + elif len(videos[0].shape) == 3: return [videos] elif is_valid_image(videos): diff --git a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py index 4c96486e413435..9ca191644302ef 100644 --- a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py @@ -58,6 +58,8 @@ def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos[0], PIL.Image.Image): return [videos] elif len(videos[0].shape) == 4: + return videos + elif len(videos[0].shape) == 3: return [videos] elif is_valid_image(videos): diff --git a/src/transformers/models/tvp/image_processing_tvp.py b/src/transformers/models/tvp/image_processing_tvp.py index d7f80cbaed1c2a..8f05b3e966f1c8 100644 --- a/src/transformers/models/tvp/image_processing_tvp.py +++ b/src/transformers/models/tvp/image_processing_tvp.py @@ -58,6 +58,8 @@ def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos[0], PIL.Image.Image): return [videos] elif len(videos[0].shape) == 4: + return videos + elif len(videos[0].shape) == 3: return [videos] elif is_valid_image(videos): diff --git a/src/transformers/models/video_llava/image_processing_video_llava.py b/src/transformers/models/video_llava/image_processing_video_llava.py index a96704b48503e6..321e36eb50bc12 100644 --- a/src/transformers/models/video_llava/image_processing_video_llava.py +++ b/src/transformers/models/video_llava/image_processing_video_llava.py @@ -59,6 +59,8 @@ def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos[0], PIL.Image.Image): return [videos] elif len(videos[0].shape) == 4: + return videos + elif len(videos[0].shape) == 3: return [videos] elif is_valid_image(videos): diff --git a/src/transformers/models/videomae/image_processing_videomae.py b/src/transformers/models/videomae/image_processing_videomae.py index be0ed4baf88ec2..628c9cb5cd66b9 100644 --- a/src/transformers/models/videomae/image_processing_videomae.py +++ b/src/transformers/models/videomae/image_processing_videomae.py @@ -57,6 +57,8 @@ def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos[0], PIL.Image.Image): return [videos] elif len(videos[0].shape) == 4: + return videos + elif len(videos[0].shape) == 3: return [videos] elif is_valid_image(videos): diff --git a/src/transformers/models/vivit/image_processing_vivit.py 
b/src/transformers/models/vivit/image_processing_vivit.py index 99587a6a27753b..51d646922feba9 100644 --- a/src/transformers/models/vivit/image_processing_vivit.py +++ b/src/transformers/models/vivit/image_processing_vivit.py @@ -59,6 +59,8 @@ def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos[0], PIL.Image.Image): return [videos] elif len(videos[0].shape) == 4: + return videos + elif len(videos[0].shape) == 3: return [videos] elif is_valid_image(videos): From ea06e458c011c1e0b4f343056a9ffe92494b518a Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Wed, 21 Aug 2024 00:22:28 +0800 Subject: [PATCH 25/25] fix make_batched_videos --- .../image_processing_instructblipvideo.py | 13 +++++++------ .../image_processing_llava_next_video.py | 13 +++++++------ .../models/mgp_str/processing_mgp_str.py | 13 +++++++++---- src/transformers/models/tvp/image_processing_tvp.py | 13 +++++++------ .../video_llava/image_processing_video_llava.py | 13 +++++++------ .../models/videomae/image_processing_videomae.py | 13 +++++++------ .../models/vivit/image_processing_vivit.py | 13 +++++++------ 7 files changed, 51 insertions(+), 40 deletions(-) diff --git a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py index a686806a97451b..093aab0c4d2cb6 100644 --- a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py @@ -50,21 +50,22 @@ # Copied from transformers.models.vivit.image_processing_vivit.make_batched_videos def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos + if isinstance(videos[0][0], PIL.Image.Image) or len(videos[0][0].shape) == 3: + return videos elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - if isinstance(videos[0], PIL.Image.Image): + if isinstance(videos[0], PIL.Image.Image) or len(videos[0].shape) == 3: return [videos] elif len(videos[0].shape) == 4: return videos - elif len(videos[0].shape) == 3: - return [videos] elif is_valid_image(videos): - if len(videos.shape) == 5: - return videos + if isinstance(videos, PIL.Image.Image) or len(videos.shape) == 3: + return [[videos]] elif len(videos.shape) == 4: return [videos] + elif len(videos.shape) == 5: + return videos raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py index 9ca191644302ef..1000d8de635699 100644 --- a/src/transformers/models/llava_next_video/image_processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/image_processing_llava_next_video.py @@ -52,21 +52,22 @@ # Copied from transformers.models.vivit.image_processing_vivit.make_batched_videos def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos + if isinstance(videos[0][0], PIL.Image.Image) or len(videos[0][0].shape) == 3: + return videos elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - if isinstance(videos[0], PIL.Image.Image): + if isinstance(videos[0], PIL.Image.Image) or len(videos[0].shape) == 3: return [videos] elif len(videos[0].shape) 
== 4: return videos - elif len(videos[0].shape) == 3: - return [videos] elif is_valid_image(videos): - if len(videos.shape) == 5: - return videos + if isinstance(videos, PIL.Image.Image) or len(videos.shape) == 3: + return [[videos]] elif len(videos.shape) == 4: return [videos] + elif len(videos.shape) == 5: + return videos raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/mgp_str/processing_mgp_str.py b/src/transformers/models/mgp_str/processing_mgp_str.py index d194017669c460..4a3bdba95ad829 100644 --- a/src/transformers/models/mgp_str/processing_mgp_str.py +++ b/src/transformers/models/mgp_str/processing_mgp_str.py @@ -155,11 +155,16 @@ def __call__( data.update(text_features) if images is not None: image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) - data.update(image_features) - # TODO: remove this after standardizing the outputs of vision-language processors if "input_ids" in data: - data["labels"] = data["input_ids"] - data.pop("input_ids") + # For backwards compatibility. MGP-STR doesn't actually use the labels, but the tests do. + # And users also expect the labels--and only the labels--to be returned. + # This requirement, however, may be relaxed in future versions. + data = { + "pixel_values": image_features["pixel_values"], + "labels": data["input_ids"], + } + else: + data.update(image_features) return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) def batch_decode(self, sequences): diff --git a/src/transformers/models/tvp/image_processing_tvp.py b/src/transformers/models/tvp/image_processing_tvp.py index 8f05b3e966f1c8..60588d213477f3 100644 --- a/src/transformers/models/tvp/image_processing_tvp.py +++ b/src/transformers/models/tvp/image_processing_tvp.py @@ -52,21 +52,22 @@ # Copied from transformers.models.vivit.image_processing_vivit.make_batched_videos def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos + if isinstance(videos[0][0], PIL.Image.Image) or len(videos[0][0].shape) == 3: + return videos elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - if isinstance(videos[0], PIL.Image.Image): + if isinstance(videos[0], PIL.Image.Image) or len(videos[0].shape) == 3: return [videos] elif len(videos[0].shape) == 4: return videos - elif len(videos[0].shape) == 3: - return [videos] elif is_valid_image(videos): - if len(videos.shape) == 5: - return videos + if isinstance(videos, PIL.Image.Image) or len(videos.shape) == 3: + return [[videos]] elif len(videos.shape) == 4: return [videos] + elif len(videos.shape) == 5: + return videos raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/video_llava/image_processing_video_llava.py b/src/transformers/models/video_llava/image_processing_video_llava.py index 321e36eb50bc12..2472b9bdd85417 100644 --- a/src/transformers/models/video_llava/image_processing_video_llava.py +++ b/src/transformers/models/video_llava/image_processing_video_llava.py @@ -53,21 +53,22 @@ # Copied from transformers.models.vivit.image_processing_vivit.make_batched_videos def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos + if isinstance(videos[0][0], PIL.Image.Image) or len(videos[0][0].shape) == 3: + return videos elif 
isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - if isinstance(videos[0], PIL.Image.Image): + if isinstance(videos[0], PIL.Image.Image) or len(videos[0].shape) == 3: return [videos] elif len(videos[0].shape) == 4: return videos - elif len(videos[0].shape) == 3: - return [videos] elif is_valid_image(videos): - if len(videos.shape) == 5: - return videos + if isinstance(videos, PIL.Image.Image) or len(videos.shape) == 3: + return [[videos]] elif len(videos.shape) == 4: return [videos] + elif len(videos.shape) == 5: + return videos raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/videomae/image_processing_videomae.py b/src/transformers/models/videomae/image_processing_videomae.py index 628c9cb5cd66b9..7355e356196ca4 100644 --- a/src/transformers/models/videomae/image_processing_videomae.py +++ b/src/transformers/models/videomae/image_processing_videomae.py @@ -51,21 +51,22 @@ # Copied from transformers.models.vivit.image_processing_vivit.make_batched_videos def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos + if isinstance(videos[0][0], PIL.Image.Image) or len(videos[0][0].shape) == 3: + return videos elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - if isinstance(videos[0], PIL.Image.Image): + if isinstance(videos[0], PIL.Image.Image) or len(videos[0].shape) == 3: return [videos] elif len(videos[0].shape) == 4: return videos - elif len(videos[0].shape) == 3: - return [videos] elif is_valid_image(videos): - if len(videos.shape) == 5: - return videos + if isinstance(videos, PIL.Image.Image) or len(videos.shape) == 3: + return [[videos]] elif len(videos.shape) == 4: return [videos] + elif len(videos.shape) == 5: + return videos raise ValueError(f"Could not make batched video from {videos}") diff --git a/src/transformers/models/vivit/image_processing_vivit.py b/src/transformers/models/vivit/image_processing_vivit.py index 51d646922feba9..b50b09089f5114 100644 --- a/src/transformers/models/vivit/image_processing_vivit.py +++ b/src/transformers/models/vivit/image_processing_vivit.py @@ -53,21 +53,22 @@ def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos + if isinstance(videos[0][0], PIL.Image.Image) or len(videos[0][0].shape) == 3: + return videos elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - if isinstance(videos[0], PIL.Image.Image): + if isinstance(videos[0], PIL.Image.Image) or len(videos[0].shape) == 3: return [videos] elif len(videos[0].shape) == 4: return videos - elif len(videos[0].shape) == 3: - return [videos] elif is_valid_image(videos): - if len(videos.shape) == 5: - return videos + if isinstance(videos, PIL.Image.Image) or len(videos.shape) == 3: + return [[videos]] elif len(videos.shape) == 4: return [videos] + elif len(videos.shape) == 5: + return videos raise ValueError(f"Could not make batched video from {videos}")