From f5d8507bed8db3131ca4b52c5bf4137c867a8331 Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Mon, 16 Sep 2024 19:47:59 +0000 Subject: [PATCH] remove optional args and udop uniformization from this PR --- src/transformers/models/udop/modeling_udop.py | 2 +- .../models/udop/processing_udop.py | 163 +++++++---------- src/transformers/processing_utils.py | 64 ------- tests/models/udop/test_processor_udop.py | 63 ++----- tests/test_processing_common.py | 170 ++++++------------ 5 files changed, 145 insertions(+), 317 deletions(-) diff --git a/src/transformers/models/udop/modeling_udop.py b/src/transformers/models/udop/modeling_udop.py index 6f7b6cf060495a..972248daaae599 100644 --- a/src/transformers/models/udop/modeling_udop.py +++ b/src/transformers/models/udop/modeling_udop.py @@ -1790,7 +1790,7 @@ def forward( >>> # one can use the various task prefixes (prompts) used during pre-training >>> # e.g. the task prefix for DocVQA is "Question answering. " >>> question = "Question answering. What is the date on the form?" - >>> encoding = processor(image, question, text_pair=words, boxes=boxes, return_tensors="pt") + >>> encoding = processor(image, question, words, boxes=boxes, return_tensors="pt") >>> # autoregressive generation >>> predicted_ids = model.generate(**encoding) diff --git a/src/transformers/models/udop/processing_udop.py b/src/transformers/models/udop/processing_udop.py index 3d4cfc9ce4334e..2902541d6f5b46 100644 --- a/src/transformers/models/udop/processing_udop.py +++ b/src/transformers/models/udop/processing_udop.py @@ -16,47 +16,12 @@ Processor class for UDOP. """ -import sys from typing import List, Optional, Union -from transformers import logging - -from ...image_processing_utils import BatchFeature from ...image_utils import ImageInput -from ...processing_utils import ProcessingKwargs, ProcessorMixin, TextKwargs -from ...tokenization_utils_base import PreTokenizedInput, TextInput - - -if sys.version_info >= (3, 11): - from typing import Unpack -else: - from typing_extensions import Unpack - - -logger = logging.get_logger(__name__) - - -class UdopTextKwargs(TextKwargs, total=False): - word_labels: Optional[Union[List[int], List[List[int]]]] - boxes: Union[List[List[int]], List[List[List[int]]]] - - -class UdopProcessorKwargs(ProcessingKwargs, total=False): - text_kwargs: UdopTextKwargs - _defaults = { - "text_kwargs": { - "add_special_tokens": True, - "padding": False, - "truncation": False, - "stride": 0, - "return_overflowing_tokens": False, - "return_special_tokens_mask": False, - "return_offsets_mapping": False, - "return_length": False, - "verbose": True, - }, - "images_kwargs": {}, - } +from ...processing_utils import ProcessorMixin +from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy +from ...utils import TensorType class UdopProcessor(ProcessorMixin): @@ -84,8 +49,6 @@ class UdopProcessor(ProcessorMixin): attributes = ["image_processor", "tokenizer"] image_processor_class = "LayoutLMv3ImageProcessor" tokenizer_class = ("UdopTokenizer", "UdopTokenizerFast") - # For backward compatibility. See transformers.processing_utils.ProcessorMixin.prepare_and_validate_optional_call_args for more details. 
- optional_call_args = ["text_pair"] def __init__(self, image_processor, tokenizer): super().__init__(image_processor, tokenizer) @@ -94,14 +57,28 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, - # The following is to capture `text_pair` argument that may be passed as a positional argument. - # See transformers.processing_utils.ProcessorMixin.prepare_and_validate_optional_call_args for more details. - # This behavior is only needed for backward compatibility and will be removed in future versions. - *args, - audio=None, - videos=None, - **kwargs: Unpack[UdopProcessorKwargs], - ) -> BatchFeature: + text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None, + boxes: Union[List[List[int]], List[List[List[int]]]] = None, + word_labels: Optional[Union[List[int], List[List[int]]]] = None, + text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + text_pair_target: Optional[ + Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] + ] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = False, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + return_tensors: Optional[Union[str, TensorType]] = None, + ) -> BatchEncoding: """ This method first forwards the `images` argument to [`~UdopImageProcessor.__call__`]. In case [`UdopImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and @@ -116,19 +93,6 @@ def __call__( Please refer to the docstring of the above two methods for more information. """ # verify input - output_kwargs = self._merge_kwargs( - UdopProcessorKwargs, - tokenizer_init_kwargs=self.tokenizer.init_kwargs, - **kwargs, - **self.prepare_and_validate_optional_call_args(*args), - ) - - boxes = output_kwargs["text_kwargs"].pop("boxes", None) - word_labels = output_kwargs["text_kwargs"].pop("word_labels", None) - text_pair = output_kwargs["text_kwargs"].pop("text_pair", None) - return_overflowing_tokens = output_kwargs["text_kwargs"].get("return_overflowing_tokens", False) - return_offsets_mapping = output_kwargs["text_kwargs"].get("return_offsets_mapping", False) - if self.image_processor.apply_ocr and (boxes is not None): raise ValueError( "You cannot provide bounding boxes if you initialized the image processor with apply_ocr set to True." 
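Before the next hunk, a minimal sketch of the calling convention this restored explicit signature supports. Everything below is illustrative rather than taken from the patch: the checkpoint name mirrors the one used in the tests further down, the image, words, and boxes are made up, and `apply_ocr` is switched off so that `boxes` may be passed manually (otherwise the guard above raises):

```python
import numpy as np
from PIL import Image

from transformers import UdopProcessor

# Illustrative setup: disable OCR so the caller supplies words and boxes
# (the ValueError guard above forbids manual boxes when apply_ocr is True).
processor = UdopProcessor.from_pretrained("microsoft/udop-large")
processor.image_processor.apply_ocr = False

image = Image.fromarray(np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8))
question = "Question answering. What is the date on the form?"
words = ["7/25/93", "TOTAL"]  # hypothetical OCR words
boxes = [[48, 84, 73, 92], [48, 100, 73, 108]]  # one bounding box per word

# With this patch, text_pair is an ordinary positional/keyword argument again,
# and boxes/word_labels are explicit parameters rather than entries routed
# through text_kwargs.
encoding = processor(image, question, words, boxes=boxes, return_tensors="pt")
print(sorted(encoding.keys()))  # ['attention_mask', 'bbox', 'input_ids', 'pixel_values']
```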
@@ -142,44 +106,66 @@ def __call__( if return_overflowing_tokens is True and return_offsets_mapping is False: raise ValueError("You cannot return overflowing tokens without returning the offsets mapping.") - if output_kwargs["text_kwargs"].get("text_target", None) is not None: + if text_target is not None: # use the processor to prepare the targets of UDOP return self.tokenizer( - **output_kwargs["text_kwargs"], + text_target=text_target, + text_pair_target=text_pair_target, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + return_tensors=return_tensors, ) else: # use the processor to prepare the inputs of UDOP # first, apply the image processor - features = self.image_processor(images=images, **output_kwargs["images_kwargs"]) - features_words = features.pop("words", None) - features_boxes = features.pop("boxes", None) - - _ = output_kwargs["text_kwargs"].pop("text_target", None) - _ = output_kwargs["text_kwargs"].pop("text_pair_target", None) - output_kwargs["text_kwargs"]["text_pair"] = text_pair - output_kwargs["text_kwargs"]["boxes"] = boxes if boxes is not None else features_boxes - output_kwargs["text_kwargs"]["word_labels"] = word_labels + features = self.image_processor(images=images, return_tensors=return_tensors) # second, apply the tokenizer if text is not None and self.image_processor.apply_ocr and text_pair is None: if isinstance(text, str): text = [text] # add batch dimension (as the image processor always adds a batch dimension) - output_kwargs["text_kwargs"]["text_pair"] = features_words + text_pair = features["words"] encoded_inputs = self.tokenizer( - text=text if text is not None else features_words, - **output_kwargs["text_kwargs"], + text=text if text is not None else features["words"], + text_pair=text_pair if text_pair is not None else None, + boxes=boxes if boxes is not None else features["boxes"], + word_labels=word_labels, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + return_tensors=return_tensors, ) # add pixel values + pixel_values = features.pop("pixel_values") if return_overflowing_tokens is True: - features["pixel_values"] = self.get_overflowing_images( - features["pixel_values"], encoded_inputs["overflow_to_sample_mapping"] - ) - features.update(encoded_inputs) + pixel_values = self.get_overflowing_images(pixel_values, encoded_inputs["overflow_to_sample_mapping"]) + encoded_inputs["pixel_values"] = pixel_values - return features + return encoded_inputs # Copied from transformers.models.layoutlmv3.processing_layoutlmv3.LayoutLMv3Processor.get_overflowing_images def get_overflowing_images(self, images, overflow_to_sample_mapping): @@ -212,20 +198,7 @@ def decode(self, *args, **kwargs): """ return 
self.tokenizer.decode(*args, **kwargs) - def post_process_image_text_to_text(self, generated_outputs): - """ - Post-process the output of the model to decode the text. - - Args: - generated_outputs (`torch.Tensor` or `np.ndarray`): - The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)` - or `(sequence_length,)`. - - Returns: - `List[str]`: The decoded text. - """ - return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=True) - @property + # Copied from transformers.models.layoutlmv3.processing_layoutlmv3.LayoutLMv3Processor.model_input_names def model_input_names(self): - return ["pixel_values", "input_ids", "bbox", "attention_mask"] + return ["input_ids", "bbox", "attention_mask", "pixel_values"] diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index b099476802ecb7..ee28c01189b439 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -35,9 +35,7 @@ from .tokenization_utils_base import ( PaddingStrategy, - PreTokenizedInput, PreTrainedTokenizerBase, - TextInput, TruncationStrategy, ) from .utils import ( @@ -108,9 +106,6 @@ class TextKwargs(TypedDict, total=False): The side on which padding will be applied. """ - text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] - text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] - text_pair_target: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] add_special_tokens: Optional[bool] padding: Union[bool, str, PaddingStrategy] truncation: Union[bool, str, TruncationStrategy] @@ -322,7 +317,6 @@ class ProcessorMixin(PushToHubMixin): attributes = ["feature_extractor", "tokenizer"] optional_attributes = ["chat_template"] - optional_call_args: List[str] = [] # Names need to be attr_class for attr in attributes feature_extractor_class = None tokenizer_class = None @@ -964,64 +958,6 @@ def validate_init_kwargs(processor_config, valid_kwargs): unused_kwargs = {k: processor_config[k] for k in unused_keys} return unused_kwargs - def prepare_and_validate_optional_call_args(self, *args): - """ - Matches optional positional arguments to their corresponding names in `optional_call_args` - in the processor class in the order they are passed to the processor call. - - Note that this should only be used in the `__call__` method of the processors with special - arguments. Special arguments are arguments that aren't `text`, `images`, `audio`, nor `videos` - but also aren't passed to the tokenizer, image processor, etc. Examples of such processors are: - - `CLIPSegProcessor` - - `LayoutLMv2Processor` - - `OwlViTProcessor` - - Also note that passing by position to the processor call is now deprecated and will be disallowed - in future versions. We only have this for backward compatibility. - - Example: - Suppose that the processor class has `optional_call_args = ["arg_name_1", "arg_name_2"]`. - And we define the call method as: - ```python - def __call__( - self, - text: str, - images: Optional[ImageInput] = None, - *arg, - audio=None, - videos=None, - ) - ``` - - Then, if we call the processor as: - ```python - images = [...] 
- processor("What is common in these images?", images, "arg_value_1", "arg_value_2") - ``` - - Then, this method will return: - ```python - { - "arg_name_1": "arg_value_1", - "arg_name_2": "arg_value_2", - } - ``` - which we could then pass as kwargs to `self._merge_kwargs` - """ - if len(args): - warnings.warn( - "Passing positional arguments to the processor call is now deprecated and will be disallowed in future versions. " - "Please pass all arguments as keyword arguments." - ) - if len(args) > len(self.optional_call_args): - raise ValueError( - f"Expected *at most* {len(self.optional_call_args)} optional positional arguments in processor call" - f"which will be matched with {' '.join(self.optional_call_args)} in the order they are passed." - f"However, got {len(args)} positional arguments instead." - "Please pass all arguments as keyword arguments instead (e.g. `processor(arg_name_1=..., arg_name_2=...))`." - ) - return {arg_name: arg_value for arg_value, arg_name in zip(args, self.optional_call_args)} - def apply_chat_template( self, conversation: Union[List[Dict[str, str]]], diff --git a/tests/models/udop/test_processor_udop.py b/tests/models/udop/test_processor_udop.py index 4d9fa43be04c3b..749ec7c3d6df78 100644 --- a/tests/models/udop/test_processor_udop.py +++ b/tests/models/udop/test_processor_udop.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json +import os import shutil import tempfile import unittest @@ -30,10 +32,9 @@ require_sentencepiece, require_tokenizers, require_torch, - require_vision, slow, ) -from transformers.utils import cached_property, is_pytesseract_available, is_torch_available +from transformers.utils import FEATURE_EXTRACTOR_NAME, cached_property, is_pytesseract_available, is_torch_available from ...test_processing_common import ProcessorTesterMixin @@ -54,19 +55,20 @@ class UdopProcessorTest(ProcessorTesterMixin, unittest.TestCase): tokenizer_class = UdopTokenizer rust_tokenizer_class = UdopTokenizerFast - processor_class = UdopProcessor maxDiff = None + processor_class = UdopProcessor def setUp(self): + image_processor_map = { + "do_resize": True, + "size": 224, + "apply_ocr": True, + } + self.tmpdirname = tempfile.mkdtemp() - image_processor = LayoutLMv3ImageProcessor( - do_resize=True, - size=224, - apply_ocr=True, - ) - tokenizer = UdopTokenizer.from_pretrained("microsoft/udop-large") - processor = UdopProcessor(image_processor=image_processor, tokenizer=tokenizer) - processor.save_pretrained(self.tmpdirname) + self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME) + with open(self.feature_extraction_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(image_processor_map) + "\n") self.tokenizer_pretrained_name = "microsoft/udop-large" @@ -78,15 +80,15 @@ def setUp(self): def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer: return self.tokenizer_class.from_pretrained(self.tokenizer_pretrained_name, **kwargs) - def get_image_processor(self, **kwargs): - return LayoutLMv3ImageProcessor.from_pretrained(self.tmpdirname, **kwargs) - def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast: return self.rust_tokenizer_class.from_pretrained(self.tokenizer_pretrained_name, **kwargs) def get_tokenizers(self, **kwargs) -> List[PreTrainedTokenizerBase]: return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)] + def get_image_processor(self, **kwargs): + return 
LayoutLMv3ImageProcessor.from_pretrained(self.tmpdirname, **kwargs) + def tearDown(self): shutil.rmtree(self.tmpdirname) @@ -151,7 +153,7 @@ def test_model_input_names(self): input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(images=image_input, text=input_str) + inputs = processor(text=input_str, images=image_input) self.assertListEqual(list(inputs.keys()), processor.model_input_names) @@ -206,31 +208,6 @@ def preprocess_data(examples): self.assertEqual(len(train_data["pixel_values"]), len(train_data["input_ids"])) - @require_torch - @require_vision - def test_unstructured_kwargs_batched(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = ["lower newer", "upper older longer string"] - image_input = self.prepare_image_inputs() * 2 - inputs = processor( - images=image_input, - text=input_str, - return_tensors="pt", - size={"height": 214, "width": 214}, - padding="longest", - max_length=76, - ) - - self.assertEqual(inputs["pixel_values"].shape[2], 214) - self.assertEqual(len(inputs["input_ids"][0]), 5) - # different use cases tests @require_sentencepiece @@ -495,7 +472,7 @@ def test_processor_case_5(self): question = "What's his name?" words = ["hello", "world"] boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] - input_processor = processor(images[0], question, text_pair=words, boxes=boxes, return_tensors="pt") + input_processor = processor(images[0], question, words, boxes, return_tensors="pt") # verify keys expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"] @@ -511,9 +488,7 @@ def test_processor_case_5(self): questions = ["How old is he?", "what's the time"] words = [["hello", "world"], ["my", "name", "is", "niels"]] boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]] - input_processor = processor( - images, questions, text_pair=words, boxes=boxes, padding=True, return_tensors="pt" - ) + input_processor = processor(images, questions, words, boxes, padding=True, return_tensors="pt") # verify keys expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"] diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index 4bee79e3bdeb11..b8ca7a6d6733fe 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -18,31 +18,26 @@ import json import tempfile + +try: + from typing import Unpack +except ImportError: + from typing_extensions import Unpack + import numpy as np -from transformers import CLIPTokenizerFast, ProcessorMixin from transformers.models.auto.processing_auto import processor_class_from_name from transformers.testing_utils import ( check_json_file_has_correct_format, - require_tokenizers, require_torch, require_vision, ) from transformers.utils import is_vision_available -try: - from typing import Unpack -except ImportError: - from typing_extensions import Unpack -import unittest - - if is_vision_available(): from PIL import Image - from transformers import CLIPImageProcessor - def prepare_image_inputs(): """This function prepares a list of PIL images""" @@ -55,9 +50,6 @@ def prepare_image_inputs(): @require_vision class ProcessorTesterMixin: processor_class = None - 
text_data_arg_name = "input_ids" - images_data_arg_name = "pixel_values" - videos_data_arg_name = "pixel_values_videos" def prepare_processor_dict(self): return {} @@ -137,42 +129,39 @@ def skip_processor_without_typed_kwargs(self, processor): def test_tokenizer_defaults_preserved_by_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - processor_components = self.prepare_components() - processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") - processor = self.processor_class(**processor_components) + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input, return_tensors="pt") - self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 117) + self.assertEqual(len(inputs["input_ids"][0]), 117) def test_image_processor_defaults_preserved_by_image_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - processor_components = self.prepare_components() - processor_components["image_processor"] = self.get_component( - "image_processor", size=(234, 234), crop_size=(234, 234) - ) - processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") + image_processor = self.get_component("image_processor", size=(234, 234)) + tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") - processor = self.processor_class(**processor_components) + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input, return_tensors="pt") - self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 234) + inputs = processor(text=input_str, images=image_input) + self.assertEqual(len(inputs["pixel_values"][0][0]), 234) def test_kwargs_overrides_default_tokenizer_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - processor_components = self.prepare_components() - processor_components["tokenizer"] = self.get_component("tokenizer", padding="longest") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer", padding="longest") - processor = self.processor_class(**processor_components) + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() @@ -180,31 +169,30 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): inputs = processor( text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length" ) - self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 112) + self.assertEqual(len(inputs["input_ids"][0]), 112) def test_kwargs_overrides_default_image_processor_kwargs(self): if "image_processor" not in 
self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - processor_components = self.prepare_components() - processor_components["image_processor"] = self.get_component("image_processor", size=(234, 234)) - processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") + image_processor = self.get_component("image_processor", size=(234, 234)) + tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") - processor = self.processor_class(**processor_components) + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor( - text=input_str, images=image_input, size=[224, 224], crop_size=(224, 224), return_tensors="pt" - ) - self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 224) + inputs = processor(text=input_str, images=image_input, size=[224, 224]) + self.assertEqual(len(inputs["pixel_values"][0][0]), 224) def test_unstructured_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - processor_components = self.prepare_components() - processor = self.processor_class(**processor_components) + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" @@ -214,19 +202,20 @@ def test_unstructured_kwargs(self): images=image_input, return_tensors="pt", size={"height": 214, "width": 214}, - crop_size={"height": 214, "width": 214}, padding="max_length", max_length=76, ) - self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 214) - self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 76) + self.assertEqual(inputs["pixel_values"].shape[2], 214) + self.assertEqual(len(inputs["input_ids"][0]), 76) def test_unstructured_kwargs_batched(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - processor_components = self.prepare_components() - processor = self.processor_class(**processor_components) + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = ["lower newer", "upper older longer string"] @@ -236,19 +225,21 @@ def test_unstructured_kwargs_batched(self): images=image_input, return_tensors="pt", size={"height": 214, "width": 214}, - crop_size={"height": 214, "width": 214}, padding="longest", max_length=76, ) - self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 214) - self.assertEqual(len(inputs[self.text_data_arg_name][0]), len(inputs[self.text_data_arg_name][1])) + self.assertEqual(inputs["pixel_values"].shape[2], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 6) def test_doubly_passed_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - processor_components = self.prepare_components() - processor = self.processor_class(**processor_components) + 
image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = ["lower newer"] @@ -259,15 +250,15 @@ def test_doubly_passed_kwargs(self): images=image_input, images_kwargs={"size": {"height": 222, "width": 222}}, size={"height": 214, "width": 214}, - crop_size={"height": 214, "width": 214}, - return_tensors="pt", ) def test_structured_kwargs_nested(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - processor_components = self.prepare_components() - processor = self.processor_class(**processor_components) + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" @@ -276,84 +267,37 @@ def test_structured_kwargs_nested(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": { - "size": {"height": 214, "width": 214}, - "crop_size": {"height": 214, "width": 214}, - }, + "images_kwargs": {"size": {"height": 214, "width": 214}}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } inputs = processor(text=input_str, images=image_input, **all_kwargs) self.skip_processor_without_typed_kwargs(processor) - self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 214) - self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 76) + self.assertEqual(inputs["pixel_values"].shape[2], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 76) def test_structured_kwargs_nested_from_dict(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - processor_components = self.prepare_components() - processor = self.processor_class(**processor_components) - self.skip_processor_without_typed_kwargs(processor) + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": { - "size": {"height": 214, "width": 214}, - "crop_size": {"height": 214, "width": 214}, - }, + "images_kwargs": {"size": {"height": 214, "width": 214}}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 214) - self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 76) - - -class MyProcessor(ProcessorMixin): - attributes = ["image_processor", "tokenizer"] - image_processor_class = "CLIPImageProcessor" - tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast") - - def __init__(self, image_processor=None, tokenizer=None, processor_attr_1=1, processor_attr_2=True): - super().__init__(image_processor, tokenizer) - - self.processor_attr_1 = processor_attr_1 - self.processor_attr_2 = processor_attr_2 - - -@require_tokenizers -@require_vision -class 
ProcessorTest(unittest.TestCase):
-    processor_class = MyProcessor
-
-    def prepare_processor_dict(self):
-        return {"processor_attr_1": 1, "processor_attr_2": False}
-
-    def get_processor(self):
-        image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
-        tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
-        processor = MyProcessor(image_processor, tokenizer, **self.prepare_processor_dict())
-
-        return processor
-
-    def test_processor_to_json_string(self):
-        processor = self.get_processor()
-        obj = json.loads(processor.to_json_string())
-        for key, value in self.prepare_processor_dict().items():
-            self.assertEqual(obj[key], value)
-            self.assertEqual(getattr(processor, key, None), value)
-
-    def test_processor_from_and_save_pretrained(self):
-        processor_first = self.get_processor()
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            saved_file = processor_first.save_pretrained(tmpdirname)[0]
-            check_json_file_has_correct_format(saved_file)
-            processor_second = self.processor_class.from_pretrained(tmpdirname)
+        self.assertEqual(inputs["pixel_values"].shape[2], 214)
 
-        self.assertEqual(processor_second.to_dict(), processor_first.to_dict())
+        self.assertEqual(len(inputs["input_ids"][0]), 76)
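A closing sketch for the overflow path this patch keeps: when the tokenizer returns overflowing windows, `__call__` routes `pixel_values` through `get_overflowing_images` so that each window is paired, via `overflow_to_sample_mapping`, with the image of the sample it came from, keeping the batch dimensions aligned. The snippet continues the illustrative `processor`, `image`, `question`, `words`, and `boxes` from the sketch after the first file's hunks; note the guard in `__call__` requires `return_offsets_mapping` whenever `return_overflowing_tokens` is set, and offset mappings in turn require the fast tokenizer:

```python
# Continuation of the illustrative example above; max_length and stride are
# assumptions, small enough that a long question/word pair would overflow.
encoding = processor(
    image,
    question,
    words,
    boxes=boxes,
    max_length=20,
    padding="max_length",
    truncation=True,
    stride=5,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,  # required alongside return_overflowing_tokens
    return_tensors="pt",
)

# If the encoded pair exceeds max_length, extra windows are produced; either
# way, get_overflowing_images keeps one pixel_values entry per window, so the
# image batch always tracks the token batch.
assert len(encoding["pixel_values"]) == encoding["input_ids"].shape[0]
```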