From c0c6815dc98f10d60bf9927cea8d309c754474c5 Mon Sep 17 00:00:00 2001 From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com> Date: Fri, 20 Sep 2024 11:40:59 -0400 Subject: [PATCH] Add support for args to ProcessorMixin for backward compatibility (#33479) * add check and prepare args for BC to ProcessorMixin, improve ProcessorTesterMixin * change size and crop_size in processor kwargs tests to do_rescale and rescale_factor * remove unnecessary llava processor kwargs test overwrite * nit * change data_arg_name to input_name * Remove unnecessary test override * Remove unnecessary tests Paligemma * Move test_prepare_and_validate_optional_call_args to TesterMixin, add docstring --- .../processing_llava_onevision.py | 5 +- src/transformers/processing_utils.py | 64 ++++++ .../models/altclip/test_processor_altclip.py | 115 +--------- .../test_processor_chinese_clip.py | 126 ----------- tests/models/llava/test_processor_llava.py | 28 +-- .../test_processing_llava_onevision.py | 203 +----------------- .../paligemma/test_processor_paligemma.py | 26 --- .../models/pixtral/test_processor_pixtral.py | 137 +----------- .../qwen2_vl/test_processing_qwen2_vl.py | 127 ----------- tests/test_processing_common.py | 154 +++++++------ 10 files changed, 173 insertions(+), 812 deletions(-) diff --git a/src/transformers/models/llava_onevision/processing_llava_onevision.py b/src/transformers/models/llava_onevision/processing_llava_onevision.py index d4ae02e0bb154c..2db0ba50c21042 100644 --- a/src/transformers/models/llava_onevision/processing_llava_onevision.py +++ b/src/transformers/models/llava_onevision/processing_llava_onevision.py @@ -57,11 +57,11 @@ class LlavaOnevisionProcessor(ProcessorMixin): r""" Constructs a LLaVa-Onevision processor which wraps a LLaVa-Onevision video processor, LLaVa-NeXT image processor and a LLaMa tokenizer into a single processor. - [`LlavaNextProcessor`] offers all the functionalities of [`LlavaOnevisionVideoProcessor`], [`LlavaNextImageProcessor`] and [`LlamaTokenizerFast`]. See the + [`LlavaNextProcessor`] offers all the functionalities of [`LlavaOnevisionVideoProcessor`], [`LlavaOnevisionImageProcessor`] and [`LlamaTokenizerFast`]. See the [`~LlavaOnevisionVideoProcessor.__call__`], [`~LlavaNextProcessor.__call__`] and [`~LlavaNextProcessor.decode`] for more information. Args: - image_processor ([`LlavaNextImageProcessor`], *optional*): + image_processor ([`LlavaOnevisionImageProcessor`], *optional*): The image processor is a required input. tokenizer ([`LlamaTokenizerFast`], *optional*): The tokenizer is a required input. @@ -114,6 +114,7 @@ def __call__( self, images: ImageInput = None, text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + audio=None, videos: VideoInput = None, **kwargs: Unpack[LlavaOnevisionProcessorKwargs], ) -> BatchFeature: diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 53e83613a07c8f..3b8f2b0544a567 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -38,7 +38,9 @@ from .tokenization_utils_base import ( PaddingStrategy, + PreTokenizedInput, PreTrainedTokenizerBase, + TextInput, TruncationStrategy, ) from .utils import ( @@ -114,6 +116,9 @@ class TextKwargs(TypedDict, total=False): The side on which padding will be applied. """ + text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] + text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] + text_pair_target: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] add_special_tokens: Optional[bool] padding: Union[bool, str, PaddingStrategy] truncation: Union[bool, str, TruncationStrategy] @@ -328,6 +333,7 @@ class ProcessorMixin(PushToHubMixin): attributes = ["feature_extractor", "tokenizer"] optional_attributes = ["chat_template"] + optional_call_args: List[str] = [] # Names need to be attr_class for attr in attributes feature_extractor_class = None tokenizer_class = None @@ -973,6 +979,64 @@ def validate_init_kwargs(processor_config, valid_kwargs): unused_kwargs = {k: processor_config[k] for k in unused_keys} return unused_kwargs + def prepare_and_validate_optional_call_args(self, *args): + """ + Matches optional positional arguments to their corresponding names in `optional_call_args` + in the processor class in the order they are passed to the processor call. + + Note that this should only be used in the `__call__` method of the processors with special + arguments. Special arguments are arguments that aren't `text`, `images`, `audio`, nor `videos` + but also aren't passed to the tokenizer, image processor, etc. Examples of such processors are: + - `CLIPSegProcessor` + - `LayoutLMv2Processor` + - `OwlViTProcessor` + + Also note that passing by position to the processor call is now deprecated and will be disallowed + in future versions. We only have this for backward compatibility. + + Example: + Suppose that the processor class has `optional_call_args = ["arg_name_1", "arg_name_2"]`. + And we define the call method as: + ```python + def __call__( + self, + text: str, + images: Optional[ImageInput] = None, + *arg, + audio=None, + videos=None, + ) + ``` + + Then, if we call the processor as: + ```python + images = [...] + processor("What is common in these images?", images, arg_value_1, arg_value_2) + ``` + + Then, this method will return: + ```python + { + "arg_name_1": arg_value_1, + "arg_name_2": arg_value_2, + } + ``` + which we could then pass as kwargs to `self._merge_kwargs` + """ + if len(args): + warnings.warn( + "Passing positional arguments to the processor call is now deprecated and will be disallowed in v4.47. " + "Please pass all arguments as keyword arguments." + ) + if len(args) > len(self.optional_call_args): + raise ValueError( + f"Expected *at most* {len(self.optional_call_args)} optional positional arguments in processor call" + f"which will be matched with {' '.join(self.optional_call_args)} in the order they are passed." + f"However, got {len(args)} positional arguments instead." + "Please pass all arguments as keyword arguments instead (e.g. `processor(arg_name_1=..., arg_name_2=...))`." + ) + return {arg_name: arg_value for arg_value, arg_name in zip(args, self.optional_call_args)} + def apply_chat_template( self, conversation: Union[List[Dict[str, str]]], diff --git a/tests/models/altclip/test_processor_altclip.py b/tests/models/altclip/test_processor_altclip.py index 1aca2280969404..33bff9c77ad263 100644 --- a/tests/models/altclip/test_processor_altclip.py +++ b/tests/models/altclip/test_processor_altclip.py @@ -18,7 +18,7 @@ import unittest from transformers import XLMRobertaTokenizer, XLMRobertaTokenizerFast -from transformers.testing_utils import require_torch, require_vision +from transformers.testing_utils import require_vision from transformers.utils import is_vision_available from ...test_processing_common import ProcessorTesterMixin @@ -50,116 +50,3 @@ def get_rust_tokenizer(self, **kwargs): def get_image_processor(self, **kwargs): return CLIPImageProcessor.from_pretrained(self.model_id, **kwargs) - - @require_torch - @require_vision - def test_unstructured_kwargs_batched(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = ["lower newer", "upper older longer string"] - image_input = self.prepare_image_inputs() * 2 - inputs = processor( - text=input_str, - images=image_input, - return_tensors="pt", - crop_size={"height": 214, "width": 214}, - padding="longest", - max_length=76, - ) - self.assertEqual(inputs["pixel_values"].shape[2], 214) - - self.assertEqual(len(inputs["input_ids"][0]), 7) - - def test_structured_kwargs_nested(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - # Define the kwargs for each modality - all_kwargs = { - "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"crop_size": {"height": 214, "width": 214}}, - "text_kwargs": {"padding": "max_length", "max_length": 76}, - } - - inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.skip_processor_without_typed_kwargs(processor) - - self.assertEqual(inputs["pixel_values"].shape[2], 214) - - self.assertEqual(len(inputs["input_ids"][0]), 76) - - def test_structured_kwargs_nested_from_dict(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - # Define the kwargs for each modality - all_kwargs = { - "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"crop_size": {"height": 214, "width": 214}}, - "text_kwargs": {"padding": "max_length", "max_length": 76}, - } - - inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.assertEqual(inputs["pixel_values"].shape[2], 214) - - self.assertEqual(len(inputs["input_ids"][0]), 76) - - def test_unstructured_kwargs(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - inputs = processor( - text=input_str, - images=image_input, - return_tensors="pt", - crop_size={"height": 214, "width": 214}, - padding="max_length", - max_length=76, - ) - - self.assertEqual(inputs["pixel_values"].shape[2], 214) - self.assertEqual(len(inputs["input_ids"][0]), 76) - - def test_image_processor_defaults_preserved_by_image_kwargs(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", crop_size=(234, 234)) - tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - self.assertEqual(len(inputs["pixel_values"][0][0]), 234) diff --git a/tests/models/chinese_clip/test_processor_chinese_clip.py b/tests/models/chinese_clip/test_processor_chinese_clip.py index 5b191ce2df0894..e433c38f789104 100644 --- a/tests/models/chinese_clip/test_processor_chinese_clip.py +++ b/tests/models/chinese_clip/test_processor_chinese_clip.py @@ -206,129 +206,3 @@ def test_model_input_names(self): inputs = processor(text=input_str, images=image_input) self.assertListEqual(list(inputs.keys()), processor.model_input_names) - - def test_unstructured_kwargs_batched(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = ["lower newer", "upper older longer string"] - image_input = self.prepare_image_inputs() * 2 - inputs = processor( - text=input_str, - images=image_input, - return_tensors="pt", - crop_size={"height": 214, "width": 214}, - padding="longest", - max_length=76, - ) - self.assertEqual(inputs["pixel_values"].shape[2], 214) - - self.assertEqual(len(inputs["input_ids"][0]), 6) - - def test_structured_kwargs_nested(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - # Define the kwargs for each modality - all_kwargs = { - "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"crop_size": {"height": 214, "width": 214}}, - "text_kwargs": {"padding": "max_length", "max_length": 76}, - } - - inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.skip_processor_without_typed_kwargs(processor) - - self.assertEqual(inputs["pixel_values"].shape[2], 214) - - self.assertEqual(len(inputs["input_ids"][0]), 76) - - def test_structured_kwargs_nested_from_dict(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - # Define the kwargs for each modality - all_kwargs = { - "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"crop_size": {"height": 214, "width": 214}}, - "text_kwargs": {"padding": "max_length", "max_length": 76}, - } - - inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.assertEqual(inputs["pixel_values"].shape[2], 214) - - self.assertEqual(len(inputs["input_ids"][0]), 76) - - def test_unstructured_kwargs(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - inputs = processor( - text=input_str, - images=image_input, - return_tensors="pt", - crop_size={"height": 214, "width": 214}, - padding="max_length", - max_length=76, - ) - - self.assertEqual(inputs["pixel_values"].shape[2], 214) - self.assertEqual(len(inputs["input_ids"][0]), 76) - - def test_image_processor_defaults_preserved_by_image_kwargs(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", crop_size=(234, 234)) - tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - self.assertEqual(len(inputs["pixel_values"][0][0]), 234) - - def test_kwargs_overrides_default_image_processor_kwargs(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", crop_size=(234, 234)) - tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input, crop_size=[224, 224]) - self.assertEqual(len(inputs["pixel_values"][0][0]), 224) diff --git a/tests/models/llava/test_processor_llava.py b/tests/models/llava/test_processor_llava.py index e62769e3450917..06a18061579670 100644 --- a/tests/models/llava/test_processor_llava.py +++ b/tests/models/llava/test_processor_llava.py @@ -17,7 +17,7 @@ import unittest from transformers import AutoProcessor, AutoTokenizer, LlamaTokenizerFast, LlavaProcessor -from transformers.testing_utils import require_torch, require_vision +from transformers.testing_utils import require_vision from transformers.utils import is_vision_available from ...test_processing_common import ProcessorTesterMixin @@ -93,29 +93,3 @@ def test_chat_template(self): formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True) self.assertEqual(expected_prompt, formatted_prompt) - - @require_torch - @require_vision - def test_unstructured_kwargs_batched(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = ["lower newer", "upper older longer string"] - image_input = self.prepare_image_inputs() * 2 - inputs = processor( - images=image_input, - text=input_str, - return_tensors="pt", - size={"height": 214, "width": 214}, - padding="longest", - max_length=76, - ) - - self.assertEqual(inputs["pixel_values"].shape[2], 214) - - self.assertEqual(len(inputs["input_ids"][0]), 5) diff --git a/tests/models/llava_onevision/test_processing_llava_onevision.py b/tests/models/llava_onevision/test_processing_llava_onevision.py index 1f998ca4bc04df..55f5980bfa1579 100644 --- a/tests/models/llava_onevision/test_processing_llava_onevision.py +++ b/tests/models/llava_onevision/test_processing_llava_onevision.py @@ -16,7 +16,7 @@ import tempfile import unittest -from transformers.testing_utils import require_torch, require_vision +from transformers.testing_utils import require_vision from transformers.utils import is_vision_available from ...test_processing_common import ProcessorTesterMixin @@ -100,204 +100,3 @@ def test_chat_template(self): formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True) self.assertEqual(expected_prompt, formatted_prompt) - - @require_torch - @require_vision - def test_image_processor_defaults_preserved_by_image_kwargs(self): - # Rewrite as llava-next image processor return pixel values with an added dimesion for image patches - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", size=(234, 234)) - video_processor = self.get_component("video_processor", size=(234, 234)) - tokenizer = self.get_component("tokenizer", max_length=117) - - processor = self.processor_class( - tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor - ) - self.skip_processor_without_typed_kwargs(processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - # added dimension for image patches - self.assertEqual(len(inputs["pixel_values"][0][0][0]), 234) - - @require_torch - @require_vision - def test_kwargs_overrides_default_image_processor_kwargs(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", crop_size=(234, 234)) - video_processor = self.get_component("video_processor", size=(234, 234)) - tokenizer = self.get_component("tokenizer", max_length=117) - - processor = self.processor_class( - tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor - ) - self.skip_processor_without_typed_kwargs(processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input, size=[224, 224]) - # added dimension for image patches - self.assertEqual(len(inputs["pixel_values"][0][0][0]), 224) - - @require_torch - @require_vision - def test_unstructured_kwargs(self): - image_processor = self.get_component("image_processor") - video_processor = self.get_component("video_processor") - tokenizer = self.get_component("tokenizer") - processor = self.processor_class( - tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor - ) - self.skip_processor_without_typed_kwargs(processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - inputs = processor( - text=input_str, - images=image_input, - return_tensors="pt", - size={"height": 214, "width": 214}, - padding="max_length", - max_length=76, - ) - - # added dimension for image patches - self.assertEqual(inputs["pixel_values"].shape[3], 214) - self.assertEqual(len(inputs["input_ids"][0]), 76) - - @require_torch - @require_vision - def test_unstructured_kwargs_batched(self): - image_processor = self.get_component("image_processor") - video_processor = self.get_component("video_processor") - tokenizer = self.get_component("tokenizer") - processor = self.processor_class( - tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor - ) - self.skip_processor_without_typed_kwargs(processor) - - input_str = ["lower newer", "upper older longer string"] - image_input = self.prepare_image_inputs() * 2 - inputs = processor( - text=input_str, - images=image_input, - return_tensors="pt", - size={"height": 214, "width": 214}, - padding="longest", - max_length=76, - ) - self.assertEqual(inputs["pixel_values"].shape[3], 214) - self.assertEqual(len(inputs["input_ids"][0]), 4) - - @require_torch - @require_vision - def test_structured_kwargs_nested(self): - image_processor = self.get_component("image_processor") - video_processor = self.get_component("video_processor") - tokenizer = self.get_component("tokenizer") - processor = self.processor_class( - tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor - ) - self.skip_processor_without_typed_kwargs(processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - # Define the kwargs for each modality - all_kwargs = { - "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"size": {"height": 214, "width": 214}}, - "text_kwargs": {"padding": "max_length", "max_length": 76}, - } - - inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.skip_processor_without_typed_kwargs(processor) - - self.assertEqual(inputs["pixel_values"].shape[3], 214) - self.assertEqual(len(inputs["input_ids"][0]), 76) - - @require_torch - @require_vision - def test_structured_kwargs_nested_from_dict(self): - image_processor = self.get_component("image_processor") - video_processor = self.get_component("video_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class( - tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor - ) - self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - # Define the kwargs for each modality - all_kwargs = { - "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"size": {"height": 214, "width": 214}}, - "text_kwargs": {"padding": "max_length", "max_length": 76}, - } - - inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.assertEqual(inputs["pixel_values"].shape[3], 214) - self.assertEqual(len(inputs["input_ids"][0]), 76) - - @require_torch - @require_vision - def test_doubly_passed_kwargs(self): - image_processor = self.get_component("image_processor") - video_processor = self.get_component("video_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class( - tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor - ) - self.skip_processor_without_typed_kwargs(processor) - - input_str = ["lower newer"] - image_input = self.prepare_image_inputs() - with self.assertRaises(ValueError): - _ = processor( - text=input_str, - images=image_input, - images_kwargs={"size": {"height": 222, "width": 222}}, - size={"height": 214, "width": 214}, - ) - - @require_vision - @require_torch - def test_kwargs_overrides_default_tokenizer_kwargs(self): - image_processor = self.get_component("image_processor") - video_processor = self.get_component("video_processor") - tokenizer = self.get_component("tokenizer", max_length=117) - - processor = self.processor_class( - tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor - ) - self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=112) - self.assertEqual(len(inputs["input_ids"][0]), 2) - - @require_vision - @require_torch - def test_tokenizer_defaults_preserved_by_kwargs(self): - image_processor = self.get_component("image_processor") - video_processor = self.get_component("video_processor") - tokenizer = self.get_component("tokenizer", max_length=117) - - processor = self.processor_class( - tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor - ) - self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input, return_tensors="pt") - self.assertEqual(len(inputs["input_ids"][0]), 2) diff --git a/tests/models/paligemma/test_processor_paligemma.py b/tests/models/paligemma/test_processor_paligemma.py index 47810f1832416f..60de913e53ae9b 100644 --- a/tests/models/paligemma/test_processor_paligemma.py +++ b/tests/models/paligemma/test_processor_paligemma.py @@ -61,29 +61,3 @@ def test_image_seq_length(self): text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length" ) self.assertEqual(len(inputs["input_ids"][0]), 112 + 14) - - @require_torch - @require_vision - def test_unstructured_kwargs_batched(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = ["lower newer", "upper older longer string"] - image_input = self.prepare_image_inputs() * 2 - inputs = processor( - text=input_str, - images=image_input, - return_tensors="pt", - size={"height": 214, "width": 214}, - padding="longest", - max_length=76, - ) - - self.assertEqual(inputs["pixel_values"].shape[2], 214) - - self.assertEqual(len(inputs["input_ids"][0]), 10) diff --git a/tests/models/pixtral/test_processor_pixtral.py b/tests/models/pixtral/test_processor_pixtral.py index 04aa3ee8a38b4e..29575b49367268 100644 --- a/tests/models/pixtral/test_processor_pixtral.py +++ b/tests/models/pixtral/test_processor_pixtral.py @@ -19,7 +19,6 @@ import torch from transformers.testing_utils import ( - require_torch, require_vision, ) from transformers.utils import is_vision_available @@ -248,144 +247,28 @@ def test_processor_with_multiple_images_multiple_lists(self): ) # fmt: on - # Override all tests requiring shape as returning tensor batches is not supported by PixtralProcessor - - @require_torch - @require_vision - def test_image_processor_defaults_preserved_by_image_kwargs(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", size={"height": 240, "width": 240}) - tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - # Added dimension by pixtral image processor - self.assertEqual(len(inputs["pixel_values"][0][0][0][0]), 240) - - @require_torch - @require_vision - def test_kwargs_overrides_default_image_processor_kwargs(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", size={"height": 400, "width": 400}) - tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input, size={"height": 240, "width": 240}) - self.assertEqual(len(inputs["pixel_values"][0][0][0][0]), 240) - - @require_torch - @require_vision - def test_structured_kwargs_nested(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - # Define the kwargs for each modality - all_kwargs = { - "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"size": {"height": 240, "width": 240}}, - "text_kwargs": {"padding": "max_length", "max_length": 76}, - } - - inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.skip_processor_without_typed_kwargs(processor) - - self.assertEqual(inputs["pixel_values"][0][0].shape[-1], 240) - - self.assertEqual(len(inputs["input_ids"][0]), 76) - - @require_torch - @require_vision - def test_structured_kwargs_nested_from_dict(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - # Define the kwargs for each modality - all_kwargs = { - "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"size": {"height": 240, "width": 240}}, - "text_kwargs": {"padding": "max_length", "max_length": 76}, - } - - inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.assertEqual(inputs["pixel_values"][0][0].shape[-1], 240) - - self.assertEqual(len(inputs["input_ids"][0]), 76) - - @require_torch - @require_vision - def test_unstructured_kwargs(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - inputs = processor( - text=input_str, - images=image_input, - return_tensors="pt", - size={"height": 240, "width": 240}, - padding="max_length", - max_length=76, - ) - - self.assertEqual(inputs["pixel_values"][0][0].shape[-1], 240) - self.assertEqual(len(inputs["input_ids"][0]), 76) - - @require_torch - @require_vision + # Override as PixtralProcessor needs nested images to work properly with batched inputs def test_unstructured_kwargs_batched(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = ["lower newer", "upper older longer string"] - # images needs to be nested to detect multiple prompts image_input = [self.prepare_image_inputs()] * 2 inputs = processor( text=input_str, images=image_input, return_tensors="pt", - size={"height": 240, "width": 240}, + do_rescale=True, + rescale_factor=-1, padding="longest", max_length=76, ) - self.assertEqual(inputs["pixel_values"][0][0].shape[-1], 240) - self.assertEqual(len(inputs["input_ids"][0]), 4) + self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0) + self.assertTrue( + len(inputs[self.text_input_name][0]) == len(inputs[self.text_input_name][1]) + and len(inputs[self.text_input_name][1]) < 76 + ) diff --git a/tests/models/qwen2_vl/test_processing_qwen2_vl.py b/tests/models/qwen2_vl/test_processing_qwen2_vl.py index d1ae16a9aa46e2..a360fc98f4c584 100644 --- a/tests/models/qwen2_vl/test_processing_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_processing_qwen2_vl.py @@ -108,130 +108,3 @@ def test_model_input_names(self): inputs = processor(text=input_str, images=image_input, videos=video_inputs) self.assertListEqual(list(inputs.keys()), processor.model_input_names) - - # Qwen2-VL doesn't accept `size` and resized to an optimal size using image_processor attrbutes - # defined at `init`. Therefore, all tests are overwritten and don't actually test if kwargs are passed - # to image processors - def test_image_processor_defaults_preserved_by_image_kwargs(self): - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - self.assertEqual(inputs["pixel_values"].shape[0], 800) - - def test_kwargs_overrides_default_image_processor_kwargs(self): - image_processor = self.get_component( - "image_processor", - ) - tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - self.assertEqual(inputs["pixel_values"].shape[0], 800) - - def test_unstructured_kwargs(self): - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - inputs = processor( - text=input_str, - images=image_input, - return_tensors="pt", - padding="max_length", - max_length=76, - ) - - self.assertEqual(inputs["pixel_values"].shape[0], 800) - self.assertEqual(len(inputs["input_ids"][0]), 76) - - def test_unstructured_kwargs_batched(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = ["lower newer", "upper older longer string"] - image_input = self.prepare_image_inputs() * 2 - inputs = processor( - text=input_str, - images=image_input, - return_tensors="pt", - padding="longest", - max_length=76, - ) - - self.assertEqual(inputs["pixel_values"].shape[0], 1600) - self.assertEqual(len(inputs["input_ids"][0]), 4) - - def test_structured_kwargs_nested(self): - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - # Define the kwargs for each modality - all_kwargs = { - "common_kwargs": {"return_tensors": "pt"}, - "text_kwargs": {"padding": "max_length", "max_length": 76}, - } - - inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.skip_processor_without_typed_kwargs(processor) - - self.assertEqual(inputs["pixel_values"].shape[0], 800) - self.assertEqual(len(inputs["input_ids"][0]), 76) - - def test_structured_kwargs_nested_from_dict(self): - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - # Define the kwargs for each modality - all_kwargs = { - "common_kwargs": {"return_tensors": "pt"}, - "text_kwargs": {"padding": "max_length", "max_length": 76}, - } - - inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.assertEqual(inputs["pixel_values"].shape[0], 800) - self.assertEqual(len(inputs["input_ids"][0]), 76) - - def test_image_processor_defaults_preserved_by_video_kwargs(self): - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = "lower newer" - video_input = self.prepare_video_inputs() - - inputs = processor(text=input_str, videos=video_input) - self.assertEqual(inputs["pixel_values_videos"].shape[0], 9600) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index fe17de4eeb5cdf..a51c1d200eb0aa 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -18,12 +18,6 @@ import json import tempfile - -try: - from typing import Unpack -except ImportError: - from typing_extensions import Unpack - import numpy as np from transformers.models.auto.processing_auto import processor_class_from_name @@ -35,6 +29,12 @@ from transformers.utils import is_vision_available +try: + from typing import Unpack +except ImportError: + from typing_extensions import Unpack + + if is_vision_available(): from PIL import Image @@ -50,6 +50,9 @@ def prepare_image_inputs(): @require_vision class ProcessorTesterMixin: processor_class = None + text_input_name = "input_ids" + images_input_name = "pixel_values" + videos_input_name = "pixel_values_videos" def prepare_processor_dict(self): return {} @@ -139,68 +142,77 @@ def skip_processor_without_typed_kwargs(self, processor): def test_tokenizer_defaults_preserved_by_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") + processor_components = self.prepare_components() + processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input, return_tensors="pt") - self.assertEqual(len(inputs["input_ids"][0]), 117) + self.assertEqual(inputs[self.text_input_name].shape[-1], 117) def test_image_processor_defaults_preserved_by_image_kwargs(self): + """ + We use do_rescale=True, rescale_factor=-1 to ensure that image_processor kwargs are preserved in the processor. + We then check that the mean of the pixel_values is less than or equal to 0 after processing. + Since the original pixel_values are in [0, 255], this is a good indicator that the rescale_factor is indeed applied. + """ if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", size=(234, 234)) - tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") + processor_components = self.prepare_components() + processor_components["image_processor"] = self.get_component( + "image_processor", do_rescale=True, rescale_factor=-1 + ) + processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input) - self.assertEqual(len(inputs["pixel_values"][0][0]), 234) + inputs = processor(text=input_str, images=image_input, return_tensors="pt") + self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0) def test_kwargs_overrides_default_tokenizer_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer", padding="longest") + processor_components = self.prepare_components() + processor_components["tokenizer"] = self.get_component("tokenizer", padding="longest") - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() inputs = processor( text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length" ) - self.assertEqual(len(inputs["input_ids"][0]), 112) + self.assertEqual(inputs[self.text_input_name].shape[-1], 112) def test_kwargs_overrides_default_image_processor_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", size=(234, 234)) - tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") + processor_components = self.prepare_components() + processor_components["image_processor"] = self.get_component( + "image_processor", do_rescale=True, rescale_factor=1 + ) + processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input, size=[224, 224]) - self.assertEqual(len(inputs["pixel_values"][0][0]), 224) + inputs = processor(text=input_str, images=image_input, do_rescale=True, rescale_factor=-1, return_tensors="pt") + self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0) def test_unstructured_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" @@ -209,21 +221,20 @@ def test_unstructured_kwargs(self): text=input_str, images=image_input, return_tensors="pt", - size={"height": 214, "width": 214}, + do_rescale=True, + rescale_factor=-1, padding="max_length", max_length=76, ) - self.assertEqual(inputs["pixel_values"].shape[2], 214) - self.assertEqual(len(inputs["input_ids"][0]), 76) + self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0) + self.assertEqual(inputs[self.text_input_name].shape[-1], 76) def test_unstructured_kwargs_batched(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = ["lower newer", "upper older longer string"] @@ -232,21 +243,23 @@ def test_unstructured_kwargs_batched(self): text=input_str, images=image_input, return_tensors="pt", - size={"height": 214, "width": 214}, + do_rescale=True, + rescale_factor=-1, padding="longest", max_length=76, ) - self.assertEqual(inputs["pixel_values"].shape[2], 214) - self.assertEqual(len(inputs["input_ids"][0]), 6) + self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0) + self.assertTrue( + len(inputs[self.text_input_name][0]) == len(inputs[self.text_input_name][1]) + and len(inputs[self.text_input_name][1]) < 76 + ) def test_doubly_passed_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = ["lower newer"] @@ -255,17 +268,16 @@ def test_doubly_passed_kwargs(self): _ = processor( text=input_str, images=image_input, - images_kwargs={"size": {"height": 222, "width": 222}}, - size={"height": 214, "width": 214}, + images_kwargs={"do_rescale": True, "rescale_factor": -1}, + do_rescale=True, + return_tensors="pt", ) def test_structured_kwargs_nested(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" @@ -274,25 +286,21 @@ def test_structured_kwargs_nested(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"size": {"height": 214, "width": 214}}, + "images_kwargs": {"do_rescale": True, "rescale_factor": -1}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } inputs = processor(text=input_str, images=image_input, **all_kwargs) self.skip_processor_without_typed_kwargs(processor) - self.assertEqual(inputs["pixel_values"].shape[2], 214) - - self.assertEqual(len(inputs["input_ids"][0]), 76) + self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0) + self.assertEqual(inputs[self.text_input_name].shape[-1], 76) def test_structured_kwargs_nested_from_dict(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() @@ -300,14 +308,13 @@ def test_structured_kwargs_nested_from_dict(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"size": {"height": 214, "width": 214}}, + "images_kwargs": {"do_rescale": True, "rescale_factor": -1}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.assertEqual(inputs["pixel_values"].shape[2], 214) - - self.assertEqual(len(inputs["input_ids"][0]), 76) + self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0) + self.assertEqual(inputs[self.text_input_name].shape[-1], 76) # TODO: the same test, but for audio + text processors that have strong overlap in kwargs # TODO (molbap) use the same structure of attribute kwargs for other tests to avoid duplication @@ -335,3 +342,28 @@ def test_overlapping_text_kwargs_handling(self): padding="max_length", text_kwargs={"padding": "do_not_pad"}, ) + + def test_prepare_and_validate_optional_call_args(self): + processor = self.get_processor() + optional_call_args_name = getattr(processor, "optional_call_args", []) + num_optional_call_args = len(optional_call_args_name) + if num_optional_call_args == 0: + self.skipTest("No optional call args") + # test all optional call args are given + optional_call_args = processor.prepare_and_validate_optional_call_args( + *(f"optional_{i}" for i in range(num_optional_call_args)) + ) + self.assertEqual( + optional_call_args, {arg_name: f"optional_{i}" for i, arg_name in enumerate(optional_call_args_name)} + ) + # test only one optional call arg is given + optional_call_args = processor.prepare_and_validate_optional_call_args("optional_1") + self.assertEqual(optional_call_args, {optional_call_args_name[0]: "optional_1"}) + # test no optional call arg is given + optional_call_args = processor.prepare_and_validate_optional_call_args() + self.assertEqual(optional_call_args, {}) + # test too many optional call args are given + with self.assertRaises(ValueError): + processor.prepare_and_validate_optional_call_args( + *(f"optional_{i}" for i in range(num_optional_call_args + 1)) + )