From f5d8507bed8db3131ca4b52c5bf4137c867a8331 Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Mon, 16 Sep 2024 19:47:59 +0000 Subject: [PATCH] remove optional args and udop uniformization from this PR --- src/transformers/models/udop/modeling_udop.py | 2 +- .../models/udop/processing_udop.py | 163 +++++++---------- src/transformers/processing_utils.py | 64 ------- tests/models/udop/test_processor_udop.py | 63 ++----- tests/test_processing_common.py | 170 ++++++------------ 5 files changed, 145 insertions(+), 317 deletions(-) diff --git a/src/transformers/models/udop/modeling_udop.py b/src/transformers/models/udop/modeling_udop.py index 6f7b6cf060495a..972248daaae599 100644 --- a/src/transformers/models/udop/modeling_udop.py +++ b/src/transformers/models/udop/modeling_udop.py @@ -1790,7 +1790,7 @@ def forward( >>> # one can use the various task prefixes (prompts) used during pre-training >>> # e.g. the task prefix for DocVQA is "Question answering. " >>> question = "Question answering. What is the date on the form?" - >>> encoding = processor(image, question, text_pair=words, boxes=boxes, return_tensors="pt") + >>> encoding = processor(image, question, words, boxes=boxes, return_tensors="pt") >>> # autoregressive generation >>> predicted_ids = model.generate(**encoding) diff --git a/src/transformers/models/udop/processing_udop.py b/src/transformers/models/udop/processing_udop.py index 3d4cfc9ce4334e..2902541d6f5b46 100644 --- a/src/transformers/models/udop/processing_udop.py +++ b/src/transformers/models/udop/processing_udop.py @@ -16,47 +16,12 @@ Processor class for UDOP. """ -import sys from typing import List, Optional, Union -from transformers import logging - -from ...image_processing_utils import BatchFeature from ...image_utils import ImageInput -from ...processing_utils import ProcessingKwargs, ProcessorMixin, TextKwargs -from ...tokenization_utils_base import PreTokenizedInput, TextInput - - -if sys.version_info >= (3, 11): - from typing import Unpack -else: - from typing_extensions import Unpack - - -logger = logging.get_logger(__name__) - - -class UdopTextKwargs(TextKwargs, total=False): - word_labels: Optional[Union[List[int], List[List[int]]]] - boxes: Union[List[List[int]], List[List[List[int]]]] - - -class UdopProcessorKwargs(ProcessingKwargs, total=False): - text_kwargs: UdopTextKwargs - _defaults = { - "text_kwargs": { - "add_special_tokens": True, - "padding": False, - "truncation": False, - "stride": 0, - "return_overflowing_tokens": False, - "return_special_tokens_mask": False, - "return_offsets_mapping": False, - "return_length": False, - "verbose": True, - }, - "images_kwargs": {}, - } +from ...processing_utils import ProcessorMixin +from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy +from ...utils import TensorType class UdopProcessor(ProcessorMixin): @@ -84,8 +49,6 @@ class UdopProcessor(ProcessorMixin): attributes = ["image_processor", "tokenizer"] image_processor_class = "LayoutLMv3ImageProcessor" tokenizer_class = ("UdopTokenizer", "UdopTokenizerFast") - # For backward compatibility. See transformers.processing_utils.ProcessorMixin.prepare_and_validate_optional_call_args for more details. 
- optional_call_args = ["text_pair"] def __init__(self, image_processor, tokenizer): super().__init__(image_processor, tokenizer) @@ -94,14 +57,28 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, - # The following is to capture `text_pair` argument that may be passed as a positional argument. - # See transformers.processing_utils.ProcessorMixin.prepare_and_validate_optional_call_args for more details. - # This behavior is only needed for backward compatibility and will be removed in future versions. - *args, - audio=None, - videos=None, - **kwargs: Unpack[UdopProcessorKwargs], - ) -> BatchFeature: + text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None, + boxes: Union[List[List[int]], List[List[List[int]]]] = None, + word_labels: Optional[Union[List[int], List[List[int]]]] = None, + text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + text_pair_target: Optional[ + Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] + ] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = False, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + return_tensors: Optional[Union[str, TensorType]] = None, + ) -> BatchEncoding: """ This method first forwards the `images` argument to [`~UdopImageProcessor.__call__`]. In case [`UdopImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and @@ -116,19 +93,6 @@ def __call__( Please refer to the docstring of the above two methods for more information. """ # verify input - output_kwargs = self._merge_kwargs( - UdopProcessorKwargs, - tokenizer_init_kwargs=self.tokenizer.init_kwargs, - **kwargs, - **self.prepare_and_validate_optional_call_args(*args), - ) - - boxes = output_kwargs["text_kwargs"].pop("boxes", None) - word_labels = output_kwargs["text_kwargs"].pop("word_labels", None) - text_pair = output_kwargs["text_kwargs"].pop("text_pair", None) - return_overflowing_tokens = output_kwargs["text_kwargs"].get("return_overflowing_tokens", False) - return_offsets_mapping = output_kwargs["text_kwargs"].get("return_offsets_mapping", False) - if self.image_processor.apply_ocr and (boxes is not None): raise ValueError( "You cannot provide bounding boxes if you initialized the image processor with apply_ocr set to True." 
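Before the next hunk, a minimal sketch of the calling convention this restored explicit signature supports. Everything below is illustrative rather than taken from the patch: the checkpoint name mirrors the one used in the tests further down, the image, words, and boxes are made up, and `apply_ocr` is switched off so that `boxes` may be passed manually (otherwise the guard above raises):

```python
import numpy as np
from PIL import Image

from transformers import UdopProcessor

# Illustrative setup: disable OCR so the caller supplies words and boxes
# (the ValueError guard above forbids manual boxes when apply_ocr is True).
processor = UdopProcessor.from_pretrained("microsoft/udop-large")
processor.image_processor.apply_ocr = False

image = Image.fromarray(np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8))
question = "Question answering. What is the date on the form?"
words = ["7/25/93", "TOTAL"]  # hypothetical OCR words
boxes = [[48, 84, 73, 92], [48, 100, 73, 108]]  # one bounding box per word

# With this patch, text_pair is an ordinary positional/keyword argument again,
# and boxes/word_labels are explicit parameters rather than entries routed
# through text_kwargs.
encoding = processor(image, question, words, boxes=boxes, return_tensors="pt")
print(sorted(encoding.keys()))  # ['attention_mask', 'bbox', 'input_ids', 'pixel_values']
```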
@@ -142,44 +106,66 @@ def __call__( if return_overflowing_tokens is True and return_offsets_mapping is False: raise ValueError("You cannot return overflowing tokens without returning the offsets mapping.") - if output_kwargs["text_kwargs"].get("text_target", None) is not None: + if text_target is not None: # use the processor to prepare the targets of UDOP return self.tokenizer( - **output_kwargs["text_kwargs"], + text_target=text_target, + text_pair_target=text_pair_target, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + return_tensors=return_tensors, ) else: # use the processor to prepare the inputs of UDOP # first, apply the image processor - features = self.image_processor(images=images, **output_kwargs["images_kwargs"]) - features_words = features.pop("words", None) - features_boxes = features.pop("boxes", None) - - _ = output_kwargs["text_kwargs"].pop("text_target", None) - _ = output_kwargs["text_kwargs"].pop("text_pair_target", None) - output_kwargs["text_kwargs"]["text_pair"] = text_pair - output_kwargs["text_kwargs"]["boxes"] = boxes if boxes is not None else features_boxes - output_kwargs["text_kwargs"]["word_labels"] = word_labels + features = self.image_processor(images=images, return_tensors=return_tensors) # second, apply the tokenizer if text is not None and self.image_processor.apply_ocr and text_pair is None: if isinstance(text, str): text = [text] # add batch dimension (as the image processor always adds a batch dimension) - output_kwargs["text_kwargs"]["text_pair"] = features_words + text_pair = features["words"] encoded_inputs = self.tokenizer( - text=text if text is not None else features_words, - **output_kwargs["text_kwargs"], + text=text if text is not None else features["words"], + text_pair=text_pair if text_pair is not None else None, + boxes=boxes if boxes is not None else features["boxes"], + word_labels=word_labels, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + return_tensors=return_tensors, ) # add pixel values + pixel_values = features.pop("pixel_values") if return_overflowing_tokens is True: - features["pixel_values"] = self.get_overflowing_images( - features["pixel_values"], encoded_inputs["overflow_to_sample_mapping"] - ) - features.update(encoded_inputs) + pixel_values = self.get_overflowing_images(pixel_values, encoded_inputs["overflow_to_sample_mapping"]) + encoded_inputs["pixel_values"] = pixel_values - return features + return encoded_inputs # Copied from transformers.models.layoutlmv3.processing_layoutlmv3.LayoutLMv3Processor.get_overflowing_images def get_overflowing_images(self, images, overflow_to_sample_mapping): @@ -212,20 +198,7 @@ def decode(self, *args, **kwargs): """ return 
self.tokenizer.decode(*args, **kwargs) - def post_process_image_text_to_text(self, generated_outputs): - """ - Post-process the output of the model to decode the text. - - Args: - generated_outputs (`torch.Tensor` or `np.ndarray`): - The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)` - or `(sequence_length,)`. - - Returns: - `List[str]`: The decoded text. - """ - return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=True) - @property + # Copied from transformers.models.layoutlmv3.processing_layoutlmv3.LayoutLMv3Processor.model_input_names def model_input_names(self): - return ["pixel_values", "input_ids", "bbox", "attention_mask"] + return ["input_ids", "bbox", "attention_mask", "pixel_values"] diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index b099476802ecb7..ee28c01189b439 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -35,9 +35,7 @@ from .tokenization_utils_base import ( PaddingStrategy, - PreTokenizedInput, PreTrainedTokenizerBase, - TextInput, TruncationStrategy, ) from .utils import ( @@ -108,9 +106,6 @@ class TextKwargs(TypedDict, total=False): The side on which padding will be applied. """ - text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] - text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] - text_pair_target: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] add_special_tokens: Optional[bool] padding: Union[bool, str, PaddingStrategy] truncation: Union[bool, str, TruncationStrategy] @@ -322,7 +317,6 @@ class ProcessorMixin(PushToHubMixin): attributes = ["feature_extractor", "tokenizer"] optional_attributes = ["chat_template"] - optional_call_args: List[str] = [] # Names need to be attr_class for attr in attributes feature_extractor_class = None tokenizer_class = None @@ -964,64 +958,6 @@ def validate_init_kwargs(processor_config, valid_kwargs): unused_kwargs = {k: processor_config[k] for k in unused_keys} return unused_kwargs - def prepare_and_validate_optional_call_args(self, *args): - """ - Matches optional positional arguments to their corresponding names in `optional_call_args` - in the processor class in the order they are passed to the processor call. - - Note that this should only be used in the `__call__` method of the processors with special - arguments. Special arguments are arguments that aren't `text`, `images`, `audio`, nor `videos` - but also aren't passed to the tokenizer, image processor, etc. Examples of such processors are: - - `CLIPSegProcessor` - - `LayoutLMv2Processor` - - `OwlViTProcessor` - - Also note that passing by position to the processor call is now deprecated and will be disallowed - in future versions. We only have this for backward compatibility. - - Example: - Suppose that the processor class has `optional_call_args = ["arg_name_1", "arg_name_2"]`. - And we define the call method as: - ```python - def __call__( - self, - text: str, - images: Optional[ImageInput] = None, - *arg, - audio=None, - videos=None, - ) - ``` - - Then, if we call the processor as: - ```python - images = [...] 
- processor("What is common in these images?", images, "arg_value_1", "arg_value_2") - ``` - - Then, this method will return: - ```python - { - "arg_name_1": "arg_value_1", - "arg_name_2": "arg_value_2", - } - ``` - which we could then pass as kwargs to `self._merge_kwargs` - """ - if len(args): - warnings.warn( - "Passing positional arguments to the processor call is now deprecated and will be disallowed in future versions. " - "Please pass all arguments as keyword arguments." - ) - if len(args) > len(self.optional_call_args): - raise ValueError( - f"Expected *at most* {len(self.optional_call_args)} optional positional arguments in processor call" - f"which will be matched with {' '.join(self.optional_call_args)} in the order they are passed." - f"However, got {len(args)} positional arguments instead." - "Please pass all arguments as keyword arguments instead (e.g. `processor(arg_name_1=..., arg_name_2=...))`." - ) - return {arg_name: arg_value for arg_value, arg_name in zip(args, self.optional_call_args)} - def apply_chat_template( self, conversation: Union[List[Dict[str, str]]], diff --git a/tests/models/udop/test_processor_udop.py b/tests/models/udop/test_processor_udop.py index 4d9fa43be04c3b..749ec7c3d6df78 100644 --- a/tests/models/udop/test_processor_udop.py +++ b/tests/models/udop/test_processor_udop.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json +import os import shutil import tempfile import unittest @@ -30,10 +32,9 @@ require_sentencepiece, require_tokenizers, require_torch, - require_vision, slow, ) -from transformers.utils import cached_property, is_pytesseract_available, is_torch_available +from transformers.utils import FEATURE_EXTRACTOR_NAME, cached_property, is_pytesseract_available, is_torch_available from ...test_processing_common import ProcessorTesterMixin @@ -54,19 +55,20 @@ class UdopProcessorTest(ProcessorTesterMixin, unittest.TestCase): tokenizer_class = UdopTokenizer rust_tokenizer_class = UdopTokenizerFast - processor_class = UdopProcessor maxDiff = None + processor_class = UdopProcessor def setUp(self): + image_processor_map = { + "do_resize": True, + "size": 224, + "apply_ocr": True, + } + self.tmpdirname = tempfile.mkdtemp() - image_processor = LayoutLMv3ImageProcessor( - do_resize=True, - size=224, - apply_ocr=True, - ) - tokenizer = UdopTokenizer.from_pretrained("microsoft/udop-large") - processor = UdopProcessor(image_processor=image_processor, tokenizer=tokenizer) - processor.save_pretrained(self.tmpdirname) + self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME) + with open(self.feature_extraction_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(image_processor_map) + "\n") self.tokenizer_pretrained_name = "microsoft/udop-large" @@ -78,15 +80,15 @@ def setUp(self): def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer: return self.tokenizer_class.from_pretrained(self.tokenizer_pretrained_name, **kwargs) - def get_image_processor(self, **kwargs): - return LayoutLMv3ImageProcessor.from_pretrained(self.tmpdirname, **kwargs) - def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast: return self.rust_tokenizer_class.from_pretrained(self.tokenizer_pretrained_name, **kwargs) def get_tokenizers(self, **kwargs) -> List[PreTrainedTokenizerBase]: return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)] + def get_image_processor(self, **kwargs): + return 
LayoutLMv3ImageProcessor.from_pretrained(self.tmpdirname, **kwargs) + def tearDown(self): shutil.rmtree(self.tmpdirname) @@ -151,7 +153,7 @@ def test_model_input_names(self): input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(images=image_input, text=input_str) + inputs = processor(text=input_str, images=image_input) self.assertListEqual(list(inputs.keys()), processor.model_input_names) @@ -206,31 +208,6 @@ def preprocess_data(examples): self.assertEqual(len(train_data["pixel_values"]), len(train_data["input_ids"])) - @require_torch - @require_vision - def test_unstructured_kwargs_batched(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = ["lower newer", "upper older longer string"] - image_input = self.prepare_image_inputs() * 2 - inputs = processor( - images=image_input, - text=input_str, - return_tensors="pt", - size={"height": 214, "width": 214}, - padding="longest", - max_length=76, - ) - - self.assertEqual(inputs["pixel_values"].shape[2], 214) - self.assertEqual(len(inputs["input_ids"][0]), 5) - # different use cases tests @require_sentencepiece @@ -495,7 +472,7 @@ def test_processor_case_5(self): question = "What's his name?" words = ["hello", "world"] boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] - input_processor = processor(images[0], question, text_pair=words, boxes=boxes, return_tensors="pt") + input_processor = processor(images[0], question, words, boxes, return_tensors="pt") # verify keys expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"] @@ -511,9 +488,7 @@ def test_processor_case_5(self): questions = ["How old is he?", "what's the time"] words = [["hello", "world"], ["my", "name", "is", "niels"]] boxes = [[[1, 2, 3, 4], [5, 6, 7, 8]], [[3, 2, 5, 1], [6, 7, 4, 2], [3, 9, 2, 4], [1, 1, 2, 3]]] - input_processor = processor( - images, questions, text_pair=words, boxes=boxes, padding=True, return_tensors="pt" - ) + input_processor = processor(images, questions, words, boxes, padding=True, return_tensors="pt") # verify keys expected_keys = ["attention_mask", "bbox", "input_ids", "pixel_values"] diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index 4bee79e3bdeb11..b8ca7a6d6733fe 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -18,31 +18,26 @@ import json import tempfile + +try: + from typing import Unpack +except ImportError: + from typing_extensions import Unpack + import numpy as np -from transformers import CLIPTokenizerFast, ProcessorMixin from transformers.models.auto.processing_auto import processor_class_from_name from transformers.testing_utils import ( check_json_file_has_correct_format, - require_tokenizers, require_torch, require_vision, ) from transformers.utils import is_vision_available -try: - from typing import Unpack -except ImportError: - from typing_extensions import Unpack -import unittest - - if is_vision_available(): from PIL import Image - from transformers import CLIPImageProcessor - def prepare_image_inputs(): """This function prepares a list of PIL images""" @@ -55,9 +50,6 @@ def prepare_image_inputs(): @require_vision class ProcessorTesterMixin: processor_class = None - 
text_data_arg_name = "input_ids" - images_data_arg_name = "pixel_values" - videos_data_arg_name = "pixel_values_videos" def prepare_processor_dict(self): return {} @@ -137,42 +129,39 @@ def skip_processor_without_typed_kwargs(self, processor): def test_tokenizer_defaults_preserved_by_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - processor_components = self.prepare_components() - processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") - processor = self.processor_class(**processor_components) + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input, return_tensors="pt") - self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 117) + self.assertEqual(len(inputs["input_ids"][0]), 117) def test_image_processor_defaults_preserved_by_image_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - processor_components = self.prepare_components() - processor_components["image_processor"] = self.get_component( - "image_processor", size=(234, 234), crop_size=(234, 234) - ) - processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") + image_processor = self.get_component("image_processor", size=(234, 234)) + tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") - processor = self.processor_class(**processor_components) + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input, return_tensors="pt") - self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 234) + inputs = processor(text=input_str, images=image_input) + self.assertEqual(len(inputs["pixel_values"][0][0]), 234) def test_kwargs_overrides_default_tokenizer_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - processor_components = self.prepare_components() - processor_components["tokenizer"] = self.get_component("tokenizer", padding="longest") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer", padding="longest") - processor = self.processor_class(**processor_components) + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() @@ -180,31 +169,30 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): inputs = processor( text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length" ) - self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 112) + self.assertEqual(len(inputs["input_ids"][0]), 112) def test_kwargs_overrides_default_image_processor_kwargs(self): if "image_processor" not in 
self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - processor_components = self.prepare_components() - processor_components["image_processor"] = self.get_component("image_processor", size=(234, 234)) - processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") + image_processor = self.get_component("image_processor", size=(234, 234)) + tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") - processor = self.processor_class(**processor_components) + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor( - text=input_str, images=image_input, size=[224, 224], crop_size=(224, 224), return_tensors="pt" - ) - self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 224) + inputs = processor(text=input_str, images=image_input, size=[224, 224]) + self.assertEqual(len(inputs["pixel_values"][0][0]), 224) def test_unstructured_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - processor_components = self.prepare_components() - processor = self.processor_class(**processor_components) + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" @@ -214,19 +202,20 @@ def test_unstructured_kwargs(self): images=image_input, return_tensors="pt", size={"height": 214, "width": 214}, - crop_size={"height": 214, "width": 214}, padding="max_length", max_length=76, ) - self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 214) - self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 76) + self.assertEqual(inputs["pixel_values"].shape[2], 214) + self.assertEqual(len(inputs["input_ids"][0]), 76) def test_unstructured_kwargs_batched(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - processor_components = self.prepare_components() - processor = self.processor_class(**processor_components) + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = ["lower newer", "upper older longer string"] @@ -236,19 +225,21 @@ def test_unstructured_kwargs_batched(self): images=image_input, return_tensors="pt", size={"height": 214, "width": 214}, - crop_size={"height": 214, "width": 214}, padding="longest", max_length=76, ) - self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 214) - self.assertEqual(len(inputs[self.text_data_arg_name][0]), len(inputs[self.text_data_arg_name][1])) + self.assertEqual(inputs["pixel_values"].shape[2], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 6) def test_doubly_passed_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - processor_components = self.prepare_components() - processor = self.processor_class(**processor_components) + 
image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = ["lower newer"] @@ -259,15 +250,15 @@ def test_doubly_passed_kwargs(self): images=image_input, images_kwargs={"size": {"height": 222, "width": 222}}, size={"height": 214, "width": 214}, - crop_size={"height": 214, "width": 214}, - return_tensors="pt", ) def test_structured_kwargs_nested(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - processor_components = self.prepare_components() - processor = self.processor_class(**processor_components) + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" @@ -276,84 +267,37 @@ def test_structured_kwargs_nested(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": { - "size": {"height": 214, "width": 214}, - "crop_size": {"height": 214, "width": 214}, - }, + "images_kwargs": {"size": {"height": 214, "width": 214}}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } inputs = processor(text=input_str, images=image_input, **all_kwargs) self.skip_processor_without_typed_kwargs(processor) - self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 214) - self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 76) + self.assertEqual(inputs["pixel_values"].shape[2], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 76) def test_structured_kwargs_nested_from_dict(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - processor_components = self.prepare_components() - processor = self.processor_class(**processor_components) - self.skip_processor_without_typed_kwargs(processor) + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": { - "size": {"height": 214, "width": 214}, - "crop_size": {"height": 214, "width": 214}, - }, + "images_kwargs": {"size": {"height": 214, "width": 214}}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 214) - self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 76) - - -class MyProcessor(ProcessorMixin): - attributes = ["image_processor", "tokenizer"] - image_processor_class = "CLIPImageProcessor" - tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast") - - def __init__(self, image_processor=None, tokenizer=None, processor_attr_1=1, processor_attr_2=True): - super().__init__(image_processor, tokenizer) - - self.processor_attr_1 = processor_attr_1 - self.processor_attr_2 = processor_attr_2 - - -@require_tokenizers -@require_vision -class 
ProcessorTest(unittest.TestCase):
-    processor_class = MyProcessor
-
-    def prepare_processor_dict(self):
-        return {"processor_attr_1": 1, "processor_attr_2": False}
-
-    def get_processor(self):
-        image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
-        tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-large-patch14")
-        processor = MyProcessor(image_processor, tokenizer, **self.prepare_processor_dict())
-
-        return processor
-
-    def test_processor_to_json_string(self):
-        processor = self.get_processor()
-        obj = json.loads(processor.to_json_string())
-        for key, value in self.prepare_processor_dict().items():
-            self.assertEqual(obj[key], value)
-            self.assertEqual(getattr(processor, key, None), value)
-
-    def test_processor_from_and_save_pretrained(self):
-        processor_first = self.get_processor()
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            saved_file = processor_first.save_pretrained(tmpdirname)[0]
-            check_json_file_has_correct_format(saved_file)
-            processor_second = self.processor_class.from_pretrained(tmpdirname)
+        self.assertEqual(inputs["pixel_values"].shape[2], 214)
 
-        self.assertEqual(processor_second.to_dict(), processor_first.to_dict())
+        self.assertEqual(len(inputs["input_ids"][0]), 76)
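A closing sketch for the overflow path this patch keeps: when the tokenizer returns overflowing windows, `__call__` routes `pixel_values` through `get_overflowing_images` so that each window is paired, via `overflow_to_sample_mapping`, with the image of the sample it came from, keeping the batch dimensions aligned. The snippet continues the illustrative `processor`, `image`, `question`, `words`, and `boxes` from the sketch after the first file's hunks; note the guard in `__call__` requires `return_offsets_mapping` whenever `return_overflowing_tokens` is set, and offset mappings in turn require the fast tokenizer:

```python
# Continuation of the illustrative example above; max_length and stride are
# assumptions, small enough that a long question/word pair would overflow.
encoding = processor(
    image,
    question,
    words,
    boxes=boxes,
    max_length=20,
    padding="max_length",
    truncation=True,
    stride=5,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,  # required alongside return_overflowing_tokens
    return_tensors="pt",
)

# If the encoded pair exceeds max_length, extra windows are produced; either
# way, get_overflowing_images keeps one pixel_values entry per window, so the
# image batch always tracks the token batch.
assert len(encoding["pixel_values"]) == encoding["input_ids"].shape[0]
```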