-
Notifications
You must be signed in to change notification settings - Fork 26.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
Uniformize kwargs for Udop processor and update docs #33628
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,10 +18,38 @@ | |
|
||
from typing import List, Optional, Union | ||
|
||
from transformers import logging | ||
|
||
from ...image_processing_utils import BatchFeature | ||
from ...image_utils import ImageInput | ||
from ...processing_utils import ProcessorMixin | ||
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy | ||
from ...utils import TensorType | ||
from ...processing_utils import ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack | ||
from ...tokenization_utils_base import PreTokenizedInput, TextInput | ||
|
||
|
||
logger = logging.get_logger(__name__) | ||
|
||
|
||
class UdopTextKwargs(TextKwargs, total=False): | ||
word_labels: Optional[Union[List[int], List[List[int]]]] | ||
boxes: Union[List[List[int]], List[List[List[int]]]] | ||
|
||
|
||
class UdopProcessorKwargs(ProcessingKwargs, total=False): | ||
text_kwargs: UdopTextKwargs | ||
_defaults = { | ||
"text_kwargs": { | ||
"add_special_tokens": True, | ||
"padding": False, | ||
"truncation": False, | ||
"stride": 0, | ||
"return_overflowing_tokens": False, | ||
"return_special_tokens_mask": False, | ||
"return_offsets_mapping": False, | ||
"return_length": False, | ||
"verbose": True, | ||
}, | ||
"images_kwargs": {}, | ||
} | ||
|
||
|
||
class UdopProcessor(ProcessorMixin): | ||
|
@@ -49,6 +77,8 @@ class UdopProcessor(ProcessorMixin): | |
attributes = ["image_processor", "tokenizer"] | ||
image_processor_class = "LayoutLMv3ImageProcessor" | ||
tokenizer_class = ("UdopTokenizer", "UdopTokenizerFast") | ||
# For backward compatibility. See transformers.processing_utils.ProcessorMixin.prepare_and_validate_optional_call_args for more details. | ||
optional_call_args = ["text_pair"] | ||
|
||
def __init__(self, image_processor, tokenizer): | ||
super().__init__(image_processor, tokenizer) | ||
|
@@ -57,28 +87,16 @@ def __call__( | |
self, | ||
images: Optional[ImageInput] = None, | ||
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, | ||
text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None, | ||
boxes: Union[List[List[int]], List[List[List[int]]]] = None, | ||
word_labels: Optional[Union[List[int], List[List[int]]]] = None, | ||
text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, | ||
text_pair_target: Optional[ | ||
Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] | ||
] = None, | ||
add_special_tokens: bool = True, | ||
padding: Union[bool, str, PaddingStrategy] = False, | ||
truncation: Union[bool, str, TruncationStrategy] = False, | ||
max_length: Optional[int] = None, | ||
stride: int = 0, | ||
pad_to_multiple_of: Optional[int] = None, | ||
return_token_type_ids: Optional[bool] = None, | ||
return_attention_mask: Optional[bool] = None, | ||
return_overflowing_tokens: bool = False, | ||
return_special_tokens_mask: bool = False, | ||
return_offsets_mapping: bool = False, | ||
return_length: bool = False, | ||
verbose: bool = True, | ||
return_tensors: Optional[Union[str, TensorType]] = None, | ||
) -> BatchEncoding: | ||
# The following is to capture `text_pair` argument that may be passed as a positional argument. | ||
# See transformers.processing_utils.ProcessorMixin.prepare_and_validate_optional_call_args for more details, | ||
# or this conversation for more context: https://github.com/huggingface/transformers/pull/32544#discussion_r1720208116 | ||
# This behavior is only needed for backward compatibility and will be removed in future versions. | ||
# | ||
*args, | ||
audio=None, | ||
videos=None, | ||
**kwargs: Unpack[UdopProcessorKwargs], | ||
) -> BatchFeature: | ||
""" | ||
This method first forwards the `images` argument to [`~UdopImageProcessor.__call__`]. In case | ||
[`UdopImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and | ||
|
@@ -93,6 +111,20 @@ def __call__( | |
Please refer to the docstring of the above two methods for more information. | ||
""" | ||
# verify input | ||
output_kwargs = self._merge_kwargs( | ||
UdopProcessorKwargs, | ||
tokenizer_init_kwargs=self.tokenizer.init_kwargs, | ||
**kwargs, | ||
**self.prepare_and_validate_optional_call_args(*args), | ||
) | ||
|
||
boxes = output_kwargs["text_kwargs"].pop("boxes", None) | ||
word_labels = output_kwargs["text_kwargs"].pop("word_labels", None) | ||
text_pair = output_kwargs["text_kwargs"].pop("text_pair", None) | ||
return_overflowing_tokens = output_kwargs["text_kwargs"].get("return_overflowing_tokens", False) | ||
return_offsets_mapping = output_kwargs["text_kwargs"].get("return_offsets_mapping", False) | ||
Comment on lines +121 to +125
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Reviewer: Ugh, lots of additional args here — but no way around it, I suppose. Maybe we could group them a bit by default values using iterators, for style and lines of code, but that's a minor nit.
Author: Yes, it's not very nice... But grouping by default values and doing list comprehensions is a bit hard to read here, I feel. |
||
text_target = output_kwargs["text_kwargs"].get("text_target", None) | ||
|
||
if self.image_processor.apply_ocr and (boxes is not None): | ||
raise ValueError( | ||
"You cannot provide bounding boxes if you initialized the image processor with apply_ocr set to True." | ||
|
@@ -103,69 +135,47 @@ def __call__( | |
"You cannot provide word labels if you initialized the image processor with apply_ocr set to True." | ||
) | ||
|
||
if return_overflowing_tokens is True and return_offsets_mapping is False: | ||
if return_overflowing_tokens and not return_offsets_mapping: | ||
raise ValueError("You cannot return overflowing tokens without returning the offsets mapping.") | ||
|
||
if text_target is not None: | ||
# use the processor to prepare the targets of UDOP | ||
return self.tokenizer( | ||
text_target=text_target, | ||
text_pair_target=text_pair_target, | ||
add_special_tokens=add_special_tokens, | ||
padding=padding, | ||
truncation=truncation, | ||
max_length=max_length, | ||
stride=stride, | ||
pad_to_multiple_of=pad_to_multiple_of, | ||
return_token_type_ids=return_token_type_ids, | ||
return_attention_mask=return_attention_mask, | ||
return_overflowing_tokens=return_overflowing_tokens, | ||
return_special_tokens_mask=return_special_tokens_mask, | ||
return_offsets_mapping=return_offsets_mapping, | ||
return_length=return_length, | ||
verbose=verbose, | ||
return_tensors=return_tensors, | ||
**output_kwargs["text_kwargs"], | ||
) | ||
|
||
else: | ||
# use the processor to prepare the inputs of UDOP | ||
# first, apply the image processor | ||
features = self.image_processor(images=images, return_tensors=return_tensors) | ||
features = self.image_processor(images=images, **output_kwargs["images_kwargs"]) | ||
features_words = features.pop("words", None) | ||
features_boxes = features.pop("boxes", None) | ||
|
||
output_kwargs["text_kwargs"].pop("text_target", None) | ||
output_kwargs["text_kwargs"].pop("text_pair_target", None) | ||
output_kwargs["text_kwargs"]["text_pair"] = text_pair | ||
output_kwargs["text_kwargs"]["boxes"] = boxes if boxes is not None else features_boxes | ||
output_kwargs["text_kwargs"]["word_labels"] = word_labels | ||
|
||
# second, apply the tokenizer | ||
if text is not None and self.image_processor.apply_ocr and text_pair is None: | ||
if isinstance(text, str): | ||
text = [text] # add batch dimension (as the image processor always adds a batch dimension) | ||
text_pair = features["words"] | ||
output_kwargs["text_kwargs"]["text_pair"] = features_words | ||
|
||
encoded_inputs = self.tokenizer( | ||
text=text if text is not None else features["words"], | ||
text_pair=text_pair if text_pair is not None else None, | ||
boxes=boxes if boxes is not None else features["boxes"], | ||
word_labels=word_labels, | ||
add_special_tokens=add_special_tokens, | ||
padding=padding, | ||
truncation=truncation, | ||
max_length=max_length, | ||
stride=stride, | ||
pad_to_multiple_of=pad_to_multiple_of, | ||
return_token_type_ids=return_token_type_ids, | ||
return_attention_mask=return_attention_mask, | ||
return_overflowing_tokens=return_overflowing_tokens, | ||
return_special_tokens_mask=return_special_tokens_mask, | ||
return_offsets_mapping=return_offsets_mapping, | ||
return_length=return_length, | ||
verbose=verbose, | ||
return_tensors=return_tensors, | ||
text=text if text is not None else features_words, | ||
**output_kwargs["text_kwargs"], | ||
) | ||
|
||
# add pixel values | ||
pixel_values = features.pop("pixel_values") | ||
if return_overflowing_tokens is True: | ||
pixel_values = self.get_overflowing_images(pixel_values, encoded_inputs["overflow_to_sample_mapping"]) | ||
encoded_inputs["pixel_values"] = pixel_values | ||
features["pixel_values"] = self.get_overflowing_images( | ||
features["pixel_values"], encoded_inputs["overflow_to_sample_mapping"] | ||
) | ||
features.update(encoded_inputs) | ||
|
||
return encoded_inputs | ||
return features | ||
|
||
# Copied from transformers.models.layoutlmv3.processing_layoutlmv3.LayoutLMv3Processor.get_overflowing_images | ||
def get_overflowing_images(self, images, overflow_to_sample_mapping): | ||
|
@@ -198,7 +208,20 @@ def decode(self, *args, **kwargs): | |
""" | ||
return self.tokenizer.decode(*args, **kwargs) | ||
|
||
def post_process_image_text_to_text(self, generated_outputs): | ||
""" | ||
Post-process the output of the model to decode the text. | ||
Args: | ||
generated_outputs (`torch.Tensor` or `np.ndarray`): | ||
The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)` | ||
or `(sequence_length,)`. | ||
Returns: | ||
`List[str]`: The decoded text. | ||
""" | ||
return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=True) | ||
|
||
@property | ||
# Copied from transformers.models.layoutlmv3.processing_layoutlmv3.LayoutLMv3Processor.model_input_names | ||
def model_input_names(self): | ||
return ["input_ids", "bbox", "attention_mask", "pixel_values"] | ||
return ["pixel_values", "input_ids", "bbox", "attention_mask"] |
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Reviewer: Very good comment 💯