diff --git a/src/transformers/models/udop/processing_udop.py b/src/transformers/models/udop/processing_udop.py
index 1c0c97d17d69d3..736a79f4882995 100644
--- a/src/transformers/models/udop/processing_udop.py
+++ b/src/transformers/models/udop/processing_udop.py
@@ -53,9 +53,7 @@ class UdopProcessorKwargs(ProcessingKwargs, total=False):
             "return_length": False,
             "verbose": True,
         },
-        "images_kwargs": {
-            "num_image_tokens": 64,
-        },
+        "images_kwargs": {},
     }
 
 
@@ -92,27 +90,8 @@ def __call__(
         self,
         images: Optional[ImageInput] = None,
         text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
-        # text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
-        # boxes: Union[List[List[int]], List[List[List[int]]]] = None,
-        # word_labels: Optional[Union[List[int], List[List[int]]]] = None,
-        # text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
-        # text_pair_target: Optional[
-        #     Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]
-        # ] = None,
-        # add_special_tokens: bool = True,
-        # padding: Union[bool, str, PaddingStrategy] = False,
-        # truncation: Union[bool, str, TruncationStrategy] = False,
-        # max_length: Optional[int] = None,
-        # stride: int = 0,
-        # pad_to_multiple_of: Optional[int] = None,
-        # return_token_type_ids: Optional[bool] = None,
-        # return_attention_mask: Optional[bool] = None,
-        # return_overflowing_tokens: bool = False,
-        # return_special_tokens_mask: bool = False,
-        # return_offsets_mapping: bool = False,
-        # return_length: bool = False,
-        # verbose: bool = True,
-        # return_tensors: Optional[Union[str, TensorType]] = None,
+        audio=None,
+        videos=None,
         **kwargs: Unpack[UdopProcessorKwargs],
     ) -> BatchFeature:
         """
@@ -158,22 +137,6 @@ def __call__(
         if output_kwargs["text_kwargs"].get("text_target", None) is not None:
             # use the processor to prepare the targets of UDOP
             return self.tokenizer(
-                # text_target=text_target,
-                # text_pair_target=text_pair_target,
-                # add_special_tokens=add_special_tokens,
-                # padding=padding,
-                # truncation=truncation,
-                # max_length=max_length,
-                # stride=stride,
-                # pad_to_multiple_of=pad_to_multiple_of,
-                # return_token_type_ids=return_token_type_ids,
-                # return_attention_mask=return_attention_mask,
-                # return_overflowing_tokens=return_overflowing_tokens,
-                # return_special_tokens_mask=return_special_tokens_mask,
-                # return_offsets_mapping=return_offsets_mapping,
-                # return_length=return_length,
-                # verbose=verbose,
-                # return_tensors=return_tensors,
                 **output_kwargs["text_kwargs"],
             )
 
@@ -198,23 +161,6 @@ def __call__(
         encoded_inputs = self.tokenizer(
             text=text if text is not None else features_words,
-            # text_pair=text_pair,
-            # boxes=boxes if boxes is not None else features_boxes,
-            # word_labels=word_labels,
-            # add_special_tokens=add_special_tokens,
-            # padding=padding,
-            # truncation=truncation,
-            # max_length=max_length,
-            # stride=stride,
-            # pad_to_multiple_of=pad_to_multiple_of,
-            # return_token_type_ids=return_token_type_ids,
-            # return_attention_mask=return_attention_mask,
-            # return_overflowing_tokens=return_overflowing_tokens,
-            # return_special_tokens_mask=return_special_tokens_mask,
-            # return_offsets_mapping=return_offsets_mapping,
-            # return_length=return_length,
-            # verbose=verbose,
-            # return_tensors=return_tensors,
             **output_kwargs["text_kwargs"],
         )
 
@@ -275,4 +221,4 @@ def post_process_image_text_to_text(self, generated_outputs):
     @property
     # Copied from transformers.models.layoutlmv3.processing_layoutlmv3.LayoutLMv3Processor.model_input_names
     def model_input_names(self):
-        return ["pixel_values", "input_ids", "bbox", "attention_mask"]
+        return ["pixel_values", "input_ids", "attention_mask", "bbox"]
diff --git a/tests/models/udop/test_processor_udop.py b/tests/models/udop/test_processor_udop.py
index 38665dcc98296d..9f0fc8b45ebbcc 100644
--- a/tests/models/udop/test_processor_udop.py
+++ b/tests/models/udop/test_processor_udop.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import json
-import os
 import shutil
 import tempfile
 import unittest
@@ -22,9 +20,10 @@
 import numpy as np
 
 from transformers import (
-    PreTrainedTokenizer,
+    AutoProcessor,
     PreTrainedTokenizerBase,
     PreTrainedTokenizerFast,
+    UdopProcessor,
     UdopTokenizer,
     UdopTokenizerFast,
 )
@@ -33,9 +32,12 @@
     require_sentencepiece,
     require_tokenizers,
     require_torch,
+    require_vision,
     slow,
 )
-from transformers.utils import FEATURE_EXTRACTOR_NAME, cached_property, is_pytesseract_available, is_torch_available
+from transformers.utils import cached_property, is_pytesseract_available, is_torch_available
+
+from ...test_processing_common import ProcessorTesterMixin
 
 
 if is_torch_available():
@@ -51,37 +53,35 @@
 @require_pytesseract
 @require_sentencepiece
 @require_tokenizers
-class UdopProcessorTest(unittest.TestCase):
-    tokenizer_class = UdopTokenizer
-    rust_tokenizer_class = UdopTokenizerFast
+class UdopProcessorTest(ProcessorTesterMixin, unittest.TestCase):
+    processor_class = UdopProcessor
     maxDiff = None
 
     def setUp(self):
-        image_processor_map = {
-            "do_resize": True,
-            "size": 224,
-            "apply_ocr": True,
-        }
-
         self.tmpdirname = tempfile.mkdtemp()
-        self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
-        with open(self.feature_extraction_file, "w", encoding="utf-8") as fp:
-            fp.write(json.dumps(image_processor_map) + "\n")
+        image_processor = LayoutLMv3ImageProcessor(
+            do_resize=True,
+            size=224,
+            apply_ocr=True,
+        )
+        tokenizer = UdopTokenizer.from_pretrained("microsoft/udop-large")
+        processor = UdopProcessor(image_processor=image_processor, tokenizer=tokenizer)
+        processor.save_pretrained(self.tmpdirname)
 
         self.tokenizer_pretrained_name = "microsoft/udop-large"
 
-    def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer:
-        return self.tokenizer_class.from_pretrained(self.tokenizer_pretrained_name, **kwargs)
+    def get_tokenizer(self, **kwargs):
+        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
+
+    def get_image_processor(self, **kwargs):
+        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
 
     def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast:
-        return self.rust_tokenizer_class.from_pretrained(self.tokenizer_pretrained_name, **kwargs)
+        return UdopTokenizerFast.from_pretrained(self.tokenizer_pretrained_name, **kwargs)
 
     def get_tokenizers(self, **kwargs) -> List[PreTrainedTokenizerBase]:
         return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)]
 
-    def get_image_processor(self, **kwargs):
-        return LayoutLMv3ImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
-
     def tearDown(self):
         shutil.rmtree(self.tmpdirname)
 
@@ -212,6 +212,179 @@ def preprocess_data(examples):
 
         self.assertEqual(len(train_data["pixel_values"]), len(train_data["input_ids"]))
 
+    @require_vision
+    @require_torch
+    def test_tokenizer_defaults_preserved_by_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input, return_tensors="pt") + self.assertEqual(len(inputs["input_ids"][0]), 117) + + @require_torch + @require_vision + def test_image_processor_defaults_preserved_by_image_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor", size=(234, 234)) + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + self.assertEqual(len(inputs["pixel_values"][0][0]), 234) + + @require_vision + @require_torch + def test_kwargs_overrides_default_tokenizer_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor( + text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length" + ) + self.assertEqual(len(inputs["input_ids"][0]), 112) + + @require_torch + @require_vision + def test_kwargs_overrides_default_image_processor_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor", size=(234, 234)) + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input, size=[224, 224]) + self.assertEqual(len(inputs["pixel_values"][0][0]), 224) + + @require_torch + @require_vision + def test_unstructured_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + size={"height": 214, "width": 214}, + padding="max_length", + max_length=76, + ) + + 
self.assertEqual(inputs["pixel_values"].shape[2], 214) + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + @require_vision + def test_unstructured_kwargs_batched(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = ["lower newer", "upper older longer string"] + image_input = self.prepare_image_inputs() * 2 + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + size={"height": 214, "width": 214}, + padding="longest", + max_length=76, + ) + + self.assertEqual(inputs["pixel_values"].shape[2], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 5) + + @require_torch + @require_vision + def test_structured_kwargs_nested(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": {"size": {"height": 214, "width": 214}}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.skip_processor_without_typed_kwargs(processor) + + self.assertEqual(inputs["pixel_values"].shape[2], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + @require_vision + def test_structured_kwargs_nested_from_dict(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": {"size": {"height": 214, "width": 214}}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.assertEqual(inputs["pixel_values"].shape[2], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 76) + # different use cases tests @require_sentencepiece
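
---

For reference, a minimal usage sketch of the kwargs convention the tests above exercise. This snippet is illustrative and not part of the patch: `document.png` is a placeholder path, and since UDOP's image processor defaults to `apply_ocr=True`, running it requires pytesseract to be installed.

```python
# Illustrative sketch: exercises the uniformized kwargs routing that
# UdopProcessorKwargs provides after this change.
from PIL import Image

from transformers import UdopProcessor

processor = UdopProcessor.from_pretrained("microsoft/udop-large")
image = Image.open("document.png").convert("RGB")  # placeholder document image

# Flattened kwargs: the processor routes `size` to the image processor and
# `padding`/`max_length` to the tokenizer via UdopProcessorKwargs.
inputs = processor(
    images=image,
    text="lower newer",
    size={"height": 224, "width": 224},
    padding="max_length",
    max_length=76,
    return_tensors="pt",
)

# Equivalent structured form, nesting kwargs per modality as in
# test_structured_kwargs_nested above.
inputs = processor(
    images=image,
    text="lower newer",
    images_kwargs={"size": {"height": 224, "width": 224}},
    text_kwargs={"padding": "max_length", "max_length": 76},
    common_kwargs={"return_tensors": "pt"},
)
```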