Skip to content

Commit

Permalink
Add tests processor Udop
Browse files Browse the repository at this point in the history
  • Loading branch information
yonigozlan committed Aug 9, 2024
1 parent 22b5295 commit d29e2bb
Show file tree
Hide file tree
Showing 2 changed files with 199 additions and 80 deletions.
62 changes: 4 additions & 58 deletions src/transformers/models/udop/processing_udop.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,7 @@ class UdopProcessorKwargs(ProcessingKwargs, total=False):
"return_length": False,
"verbose": True,
},
"images_kwargs": {
"num_image_tokens": 64,
},
"images_kwargs": {},
}


Expand Down Expand Up @@ -92,27 +90,8 @@ def __call__(
self,
images: Optional[ImageInput] = None,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
# text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
# boxes: Union[List[List[int]], List[List[List[int]]]] = None,
# word_labels: Optional[Union[List[int], List[List[int]]]] = None,
# text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
# text_pair_target: Optional[
# Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]
# ] = None,
# add_special_tokens: bool = True,
# padding: Union[bool, str, PaddingStrategy] = False,
# truncation: Union[bool, str, TruncationStrategy] = False,
# max_length: Optional[int] = None,
# stride: int = 0,
# pad_to_multiple_of: Optional[int] = None,
# return_token_type_ids: Optional[bool] = None,
# return_attention_mask: Optional[bool] = None,
# return_overflowing_tokens: bool = False,
# return_special_tokens_mask: bool = False,
# return_offsets_mapping: bool = False,
# return_length: bool = False,
# verbose: bool = True,
# return_tensors: Optional[Union[str, TensorType]] = None,
audio=None,
videos=None,
**kwargs: Unpack[UdopProcessorKwargs],
) -> BatchFeature:
"""
Expand Down Expand Up @@ -158,22 +137,6 @@ def __call__(
if output_kwargs["text_kwargs"].get("text_target", None) is not None:
# use the processor to prepare the targets of UDOP
return self.tokenizer(
# text_target=text_target,
# text_pair_target=text_pair_target,
# add_special_tokens=add_special_tokens,
# padding=padding,
# truncation=truncation,
# max_length=max_length,
# stride=stride,
# pad_to_multiple_of=pad_to_multiple_of,
# return_token_type_ids=return_token_type_ids,
# return_attention_mask=return_attention_mask,
# return_overflowing_tokens=return_overflowing_tokens,
# return_special_tokens_mask=return_special_tokens_mask,
# return_offsets_mapping=return_offsets_mapping,
# return_length=return_length,
# verbose=verbose,
# return_tensors=return_tensors,
**output_kwargs["text_kwargs"],
)

Expand All @@ -198,23 +161,6 @@ def __call__(

encoded_inputs = self.tokenizer(
text=text if text is not None else features_words,
# text_pair=text_pair,
# boxes=boxes if boxes is not None else features_boxes,
# word_labels=word_labels,
# add_special_tokens=add_special_tokens,
# padding=padding,
# truncation=truncation,
# max_length=max_length,
# stride=stride,
# pad_to_multiple_of=pad_to_multiple_of,
# return_token_type_ids=return_token_type_ids,
# return_attention_mask=return_attention_mask,
# return_overflowing_tokens=return_overflowing_tokens,
# return_special_tokens_mask=return_special_tokens_mask,
# return_offsets_mapping=return_offsets_mapping,
# return_length=return_length,
# verbose=verbose,
# return_tensors=return_tensors,
**output_kwargs["text_kwargs"],
)

Expand Down Expand Up @@ -275,4 +221,4 @@ def post_process_image_text_to_text(self, generated_outputs):
@property
# Copied from transformers.models.layoutlmv3.processing_layoutlmv3.LayoutLMv3Processor.model_input_names
def model_input_names(self):
return ["pixel_values", "input_ids", "bbox", "attention_mask"]
return ["pixel_values", "input_ids", "attention_mask", "bbox"]
217 changes: 195 additions & 22 deletions tests/models/udop/test_processor_udop.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import shutil
import tempfile
import unittest
Expand All @@ -22,9 +20,10 @@
import numpy as np

from transformers import (
PreTrainedTokenizer,
AutoProcessor,
PreTrainedTokenizerBase,
PreTrainedTokenizerFast,
UdopProcessor,
UdopTokenizer,
UdopTokenizerFast,
)
Expand All @@ -33,9 +32,12 @@
require_sentencepiece,
require_tokenizers,
require_torch,
require_vision,
slow,
)
from transformers.utils import FEATURE_EXTRACTOR_NAME, cached_property, is_pytesseract_available, is_torch_available
from transformers.utils import cached_property, is_pytesseract_available, is_torch_available

from ...test_processing_common import ProcessorTesterMixin


if is_torch_available():
Expand All @@ -51,37 +53,35 @@
@require_pytesseract
@require_sentencepiece
@require_tokenizers
class UdopProcessorTest(unittest.TestCase):
tokenizer_class = UdopTokenizer
rust_tokenizer_class = UdopTokenizerFast
class UdopProcessorTest(ProcessorTesterMixin, unittest.TestCase):
processor_class = UdopProcessor
maxDiff = None

def setUp(self):
image_processor_map = {
"do_resize": True,
"size": 224,
"apply_ocr": True,
}

self.tmpdirname = tempfile.mkdtemp()
self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
with open(self.feature_extraction_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(image_processor_map) + "\n")
image_processor = LayoutLMv3ImageProcessor(
do_resize=True,
size=224,
apply_ocr=True,
)
tokenizer = UdopTokenizer.from_pretrained("microsoft/udop-large")
processor = UdopProcessor(image_processor=image_processor, tokenizer=tokenizer)
processor.save_pretrained(self.tmpdirname)

self.tokenizer_pretrained_name = "microsoft/udop-large"

def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer:
return self.tokenizer_class.from_pretrained(self.tokenizer_pretrained_name, **kwargs)
def get_tokenizer(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer

def get_image_processor(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor

def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast:
return self.rust_tokenizer_class.from_pretrained(self.tokenizer_pretrained_name, **kwargs)
return UdopTokenizerFast.from_pretrained(self.tokenizer_pretrained_name, **kwargs)

def get_tokenizers(self, **kwargs) -> List[PreTrainedTokenizerBase]:
return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)]

def get_image_processor(self, **kwargs):
return LayoutLMv3ImageProcessor.from_pretrained(self.tmpdirname, **kwargs)

def tearDown(self):
shutil.rmtree(self.tmpdirname)

Expand Down Expand Up @@ -212,6 +212,179 @@ def preprocess_data(examples):

self.assertEqual(len(train_data["pixel_values"]), len(train_data["input_ids"]))

@require_vision
@require_torch
def test_tokenizer_defaults_preserved_by_kwargs(self):
    """Tokenizer settings chosen at component creation must survive a plain call.

    The tokenizer component is requested with ``max_length=117`` and
    ``padding="max_length"``. The processor call passes no text kwargs of
    its own, so the first ``input_ids`` row is expected to be 117 long.
    """
    if "image_processor" not in self.processor_class.attributes:
        self.skipTest(f"image_processor attribute not present in {self.processor_class}")

    img_proc = self.get_component("image_processor")
    tok = self.get_component("tokenizer", max_length=117, padding="max_length")
    proc = self.processor_class(tokenizer=tok, image_processor=img_proc)
    self.skip_processor_without_typed_kwargs(proc)

    prompt = "lower newer"
    images = self.prepare_image_inputs()

    encoded = proc(text=prompt, images=images, return_tensors="pt")
    first_row = encoded["input_ids"][0]
    self.assertEqual(len(first_row), 117)

@require_torch
@require_vision
def test_image_processor_defaults_preserved_by_image_kwargs(self):
    """Image-processor settings chosen at component creation must survive a plain call.

    The image processor component is requested with ``size=(234, 234)``.
    The processor call passes no image kwargs, so the checked dimension of
    ``pixel_values`` is expected to be 234.
    """
    if "image_processor" not in self.processor_class.attributes:
        self.skipTest(f"image_processor attribute not present in {self.processor_class}")

    img_proc = self.get_component("image_processor", size=(234, 234))
    tok = self.get_component("tokenizer", max_length=117)
    proc = self.processor_class(tokenizer=tok, image_processor=img_proc)
    self.skip_processor_without_typed_kwargs(proc)

    prompt = "lower newer"
    images = self.prepare_image_inputs()

    encoded = proc(text=prompt, images=images)
    first_channel = encoded["pixel_values"][0][0]
    self.assertEqual(len(first_channel), 234)

@require_vision
@require_torch
def test_kwargs_overrides_default_tokenizer_kwargs(self):
    """Text kwargs passed at call time must take precedence over tokenizer defaults.

    The tokenizer component is requested with ``max_length=117``, while the
    call passes ``max_length=112`` and ``padding="max_length"``; the first
    ``input_ids`` row is expected to be 112 long.
    """
    if "image_processor" not in self.processor_class.attributes:
        self.skipTest(f"image_processor attribute not present in {self.processor_class}")

    img_proc = self.get_component("image_processor")
    tok = self.get_component("tokenizer", max_length=117)
    proc = self.processor_class(tokenizer=tok, image_processor=img_proc)
    self.skip_processor_without_typed_kwargs(proc)

    prompt = "lower newer"
    images = self.prepare_image_inputs()

    encoded = proc(
        text=prompt,
        images=images,
        return_tensors="pt",
        max_length=112,
        padding="max_length",
    )
    first_row = encoded["input_ids"][0]
    self.assertEqual(len(first_row), 112)

@require_torch
@require_vision
def test_kwargs_overrides_default_image_processor_kwargs(self):
    """Image kwargs passed at call time must take precedence over image-processor defaults.

    The image processor component is requested with ``size=(234, 234)``,
    while the call passes ``size=[224, 224]``; the checked dimension of
    ``pixel_values`` is expected to be 224.
    """
    if "image_processor" not in self.processor_class.attributes:
        self.skipTest(f"image_processor attribute not present in {self.processor_class}")

    img_proc = self.get_component("image_processor", size=(234, 234))
    tok = self.get_component("tokenizer", max_length=117)
    proc = self.processor_class(tokenizer=tok, image_processor=img_proc)
    self.skip_processor_without_typed_kwargs(proc)

    prompt = "lower newer"
    images = self.prepare_image_inputs()

    encoded = proc(text=prompt, images=images, size=[224, 224])
    first_channel = encoded["pixel_values"][0][0]
    self.assertEqual(len(first_channel), 224)

@require_torch
@require_vision
def test_unstructured_kwargs(self):
    """Flat (non-nested) kwargs must be routed to the right sub-processor.

    A single call mixes an image kwarg (``size``), text kwargs (``padding``,
    ``max_length``) and ``return_tensors``. The test checks that
    ``pixel_values.shape[2]`` is 214 and that the first ``input_ids`` row
    is 76 long.
    """
    if "image_processor" not in self.processor_class.attributes:
        self.skipTest(f"image_processor attribute not present in {self.processor_class}")

    img_proc = self.get_component("image_processor")
    tok = self.get_component("tokenizer")
    proc = self.processor_class(tokenizer=tok, image_processor=img_proc)
    self.skip_processor_without_typed_kwargs(proc)

    prompt = "lower newer"
    images = self.prepare_image_inputs()
    encoded = proc(
        text=prompt,
        images=images,
        return_tensors="pt",
        size={"height": 214, "width": 214},
        padding="max_length",
        max_length=76,
    )

    pixel_shape = encoded["pixel_values"].shape
    self.assertEqual(pixel_shape[2], 214)
    first_row = encoded["input_ids"][0]
    self.assertEqual(len(first_row), 76)

@require_torch
@require_vision
def test_unstructured_kwargs_batched(self):
    """Flat kwargs must also be routed correctly for a batch of two samples.

    Two prompts are paired with the prepared image inputs repeated twice.
    With ``padding="longest"`` the test expects the first ``input_ids`` row
    to be 5 long, and ``pixel_values.shape[2]`` to be 214.
    """
    if "image_processor" not in self.processor_class.attributes:
        self.skipTest(f"image_processor attribute not present in {self.processor_class}")

    img_proc = self.get_component("image_processor")
    tok = self.get_component("tokenizer")
    proc = self.processor_class(tokenizer=tok, image_processor=img_proc)
    self.skip_processor_without_typed_kwargs(proc)

    prompts = ["lower newer", "upper older longer string"]
    # Repeat the prepared inputs so there is one image entry per prompt.
    images = self.prepare_image_inputs() * 2
    encoded = proc(
        text=prompts,
        images=images,
        return_tensors="pt",
        size={"height": 214, "width": 214},
        padding="longest",
        max_length=76,
    )

    pixel_shape = encoded["pixel_values"].shape
    self.assertEqual(pixel_shape[2], 214)

    first_row = encoded["input_ids"][0]
    self.assertEqual(len(first_row), 5)

@require_torch
@require_vision
def test_structured_kwargs_nested(self):
    """Kwargs grouped per modality must be honoured by the processor.

    The call receives ``common_kwargs``, ``images_kwargs`` and
    ``text_kwargs`` dictionaries. The test checks that
    ``pixel_values.shape[2]`` is 214 and that the first ``input_ids`` row
    is 76 long.
    """
    if "image_processor" not in self.processor_class.attributes:
        self.skipTest(f"image_processor attribute not present in {self.processor_class}")
    image_processor = self.get_component("image_processor")
    tokenizer = self.get_component("tokenizer")

    processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
    # Skip check runs once, before the processor is exercised; the former
    # second call after the processor invocation was a redundant duplicate.
    self.skip_processor_without_typed_kwargs(processor)

    input_str = "lower newer"
    image_input = self.prepare_image_inputs()

    # Define the kwargs for each modality
    all_kwargs = {
        "common_kwargs": {"return_tensors": "pt"},
        "images_kwargs": {"size": {"height": 214, "width": 214}},
        "text_kwargs": {"padding": "max_length", "max_length": 76},
    }

    inputs = processor(text=input_str, images=image_input, **all_kwargs)

    self.assertEqual(inputs["pixel_values"].shape[2], 214)

    self.assertEqual(len(inputs["input_ids"][0]), 76)

@require_torch
@require_vision
def test_structured_kwargs_nested_from_dict(self):
    """Per-modality kwargs unpacked from a single dict must be honoured.

    A dict holding ``common_kwargs``, ``images_kwargs`` and ``text_kwargs``
    is unpacked into the processor call. The test checks that
    ``pixel_values.shape[2]`` is 214 and that the first ``input_ids`` row
    is 76 long.
    """
    if "image_processor" not in self.processor_class.attributes:
        self.skipTest(f"image_processor attribute not present in {self.processor_class}")

    img_proc = self.get_component("image_processor")
    tok = self.get_component("tokenizer")
    proc = self.processor_class(tokenizer=tok, image_processor=img_proc)
    self.skip_processor_without_typed_kwargs(proc)

    prompt = "lower newer"
    images = self.prepare_image_inputs()

    # One sub-dict per modality, plus the settings shared by all of them.
    nested_kwargs = {
        "common_kwargs": {"return_tensors": "pt"},
        "images_kwargs": {"size": {"height": 214, "width": 214}},
        "text_kwargs": {"padding": "max_length", "max_length": 76},
    }

    encoded = proc(text=prompt, images=images, **nested_kwargs)

    pixel_shape = encoded["pixel_values"].shape
    self.assertEqual(pixel_shape[2], 214)

    first_row = encoded["input_ids"][0]
    self.assertEqual(len(first_row), 76)


# different use cases tests
@require_sentencepiece
Expand Down

0 comments on commit d29e2bb

Please sign in to comment.