From 508e1a47084ef0941a131dd3a49bf9540a1d1b35 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Fri, 16 Aug 2024 16:21:31 +0800 Subject: [PATCH 01/10] uniformize processor kwargs of nougat --- .../models/nougat/processing_nougat.py | 180 +++++++++++------- 1 file changed, 107 insertions(+), 73 deletions(-) diff --git a/src/transformers/models/nougat/processing_nougat.py b/src/transformers/models/nougat/processing_nougat.py index 8f94c6718ba660..ac7aef12d6199c 100644 --- a/src/transformers/models/nougat/processing_nougat.py +++ b/src/transformers/models/nougat/processing_nougat.py @@ -16,12 +16,52 @@ Processor class for Nougat. """ -from typing import Dict, List, Optional, Union +import sys +import warnings +from typing import List, Optional, Union -from transformers.tokenization_utils_base import PreTokenizedInput, TextInput, TruncationStrategy +from ...image_utils import ImageInput +from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs +from ...tokenization_utils_base import PreTokenizedInput, TextInput -from ...processing_utils import ProcessorMixin -from ...utils import PaddingStrategy, TensorType + +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + + +class NougatTextKwargs(TextKwargs, total=False): + text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] + text_target: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] + text_pair_target: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] + + +class NougatImagesKwargs(ImagesKwargs, total=False): + do_crop_margin: Optional[bool] + do_thumbnail: Optional[bool] + do_align_long_axis: Optional[bool] + + +class NougatProcessorKwargs(ProcessingKwargs, total=False): + text_kwargs: NougatTextKwargs + images_kwargs: NougatImagesKwargs + _defaults = { + "text_kwargs": { + "add_special_tokens": True, + "padding": False, + "stride": 0, + "is_split_into_words": False, + "return_overflowing_tokens": False, + "return_special_tokens_mask": False, + "return_offsets_mapping": False, + "return_length": False, + "verbose": True, + }, + "images_kwargs": { + "data_format": "channels_first", + }, + } class NougatProcessor(ProcessorMixin): @@ -48,86 +88,80 @@ def __init__(self, image_processor, tokenizer): def __call__( self, - images=None, - text=None, - do_crop_margin: bool = None, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: "PILImageResampling" = None, # noqa: F821 - do_thumbnail: bool = None, - do_align_long_axis: bool = None, - do_pad: bool = None, - do_rescale: bool = None, - rescale_factor: Union[int, float] = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - data_format: Optional["ChannelDimension"] = "channels_first", # noqa: F821 - input_data_format: Optional[Union[str, "ChannelDimension"]] = None, # noqa: F821 - text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, - text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, - text_pair_target: Optional[ - Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] - ] = None, - add_special_tokens: bool = True, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = None, - max_length: Optional[int] 
= None, - stride: int = 0, - is_split_into_words: bool = False, - pad_to_multiple_of: Optional[int] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - return_token_type_ids: Optional[bool] = None, - return_attention_mask: Optional[bool] = None, - return_overflowing_tokens: bool = False, - return_special_tokens_mask: bool = False, - return_offsets_mapping: bool = False, - return_length: bool = False, - verbose: bool = True, + images: Optional[ImageInput] = None, + text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + audio=None, + videos=None, + backwards_compatibility_placeholder_arg=None, + **kwargs: Unpack[NougatProcessorKwargs], ): if images is None and text is None: raise ValueError("You need to specify either an `images` or `text` input to process.") - if images is not None: - inputs = self.image_processor( - images, - do_crop_margin=do_crop_margin, - do_resize=do_resize, - size=size, - resample=resample, - do_thumbnail=do_thumbnail, - do_align_long_axis=do_align_long_axis, - do_pad=do_pad, - do_rescale=do_rescale, - rescale_factor=rescale_factor, - do_normalize=do_normalize, - image_mean=image_mean, - image_std=image_std, - return_tensors=return_tensors, - data_format=data_format, - input_data_format=input_data_format, + output_kwargs = self._merge_kwargs( + NougatProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + + if output_kwargs["text_kwargs"].get("text_pair") is not None and audio is not None: + raise ValueError( + "You cannot provide `text_pair` as a positional argument and as a keyword argument at the same time." + "Please provide it only as a keyword argument (i.e. `text_pair=...`)." + ) + if "text_pair" not in output_kwargs["text_kwargs"]: + warnings.warn( + "No `text_pair` kwarg was detected. The use of `text_pair` as an argument without specifying it explicitely as `text_pair=` will be deprecated in future versions." ) + # For backwards compatibility, we reuse `audio` as `text_pair` in case + # downstream users passed it as a positional argument + if audio is not None: + output_kwargs["text_kwargs"]["text_pair"] = audio + + if output_kwargs["text_kwargs"].get("text_target") is not None and videos is not None: + raise ValueError( + "You cannot provide `text_target` as a positional argument and as a keyword argument at the same time." + "Please provide it only as a keyword argument (i.e. `text_target=...`)." + ) + if "text_target" not in output_kwargs["text_kwargs"]: + warnings.warn( + "No `text_target` kwarg was detected. The use of `text_target` as an argument without specifying it explicitely as `text_target=` will be deprecated in future versions." + ) + # For backwards compatibility, we reuse `videos` as `text_target` in case + # downstream users passed it as a positional argument + if videos is not None: + output_kwargs["text_kwargs"]["text_target"] = videos + + if ( + output_kwargs["text_kwargs"].get("text_pair_target") is not None + and backwards_compatibility_placeholder_arg is not None + ): + raise ValueError( + "You cannot provide `text_pair_target` as a positional argument and as a keyword argument at the same time." + "Please provide it only as a keyword argument (i.e. `text_pair_target=...`)." + ) + if "text_pair_target" not in output_kwargs["text_kwargs"]: + warnings.warn( + "No `text_pair_target` kwarg was detected. The use of `text_pair_target` as an argument without specifying it explicitely as `text_pair_target=` will be deprecated in future versions." 
+ ) + # For backwards compatibility, we reuse `backwards_compatibility_placeholder_arg` as `text_pair_target` in case + # downstream users passed it as a positional argument + if backwards_compatibility_placeholder_arg is not None: + output_kwargs["text_kwargs"]["text_pair_target"] = backwards_compatibility_placeholder_arg + + text_pair = output_kwargs["text_kwargs"].pop("text_pair", None) + text_target = output_kwargs["text_kwargs"].pop("text_target", None) + text_pair_target = output_kwargs["text_kwargs"].pop("text_pair_target", None) + + if images is not None: + inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) if text is not None: encodings = self.tokenizer( text, text_pair=text_pair, text_target=text_target, text_pair_target=text_pair_target, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - stride=stride, - is_split_into_words=is_split_into_words, - pad_to_multiple_of=pad_to_multiple_of, - return_tensors=return_tensors, - return_token_type_ids=return_token_type_ids, - return_attention_mask=return_attention_mask, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_offsets_mapping=return_offsets_mapping, - return_length=return_length, - verbose=verbose, + **output_kwargs["text_kwargs"], ) if text is None: From 257c690fe4cfb71cdeb4c2c3ec53db2176030343 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Fri, 16 Aug 2024 18:30:21 +0800 Subject: [PATCH 02/10] add tests and more docs --- .../models/donut/image_processing_donut.py | 2 ++ .../models/fuyu/image_processing_fuyu.py | 3 ++- .../models/nougat/image_processing_nougat.py | 2 ++ .../models/nougat/processing_nougat.py | 14 +++++----- .../models/sam/image_processing_sam.py | 1 + .../models/tvp/image_processing_tvp.py | 1 + tests/models/nougat/test_processor_nougat.py | 17 ++++++++++++ tests/test_processing_common.py | 26 ++++++++++--------- 8 files changed, 47 insertions(+), 19 deletions(-) create mode 100644 tests/models/nougat/test_processor_nougat.py diff --git a/src/transformers/models/donut/image_processing_donut.py b/src/transformers/models/donut/image_processing_donut.py index edb0629d44bd04..b56db329420460 100644 --- a/src/transformers/models/donut/image_processing_donut.py +++ b/src/transformers/models/donut/image_processing_donut.py @@ -183,6 +183,7 @@ def pad_image( input_data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format of the input image. If not provided, it will be inferred. """ + size = get_size_dict(size) output_height, output_width = size["height"], size["width"] input_height, input_width = get_image_size(image, channel_dim=input_data_format) @@ -232,6 +233,7 @@ def thumbnail( The channel dimension format of the input image. If not provided, it will be inferred. """ input_height, input_width = get_image_size(image, channel_dim=input_data_format) + size = get_size_dict(size) output_height, output_width = size["height"], size["width"] # We always resize to the smallest of either the input or output size. 
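[Note on the `get_size_dict` additions in this patch] The `size = get_size_dict(size)` / `pad_size = get_size_dict(pad_size)` lines added here in Donut's `pad_image` and `thumbnail`, and below in the Fuyu, Nougat, SAM and TVP image processors, all address the same issue: with the uniformized kwargs path, `size` can reach these helpers as an int or a `[height, width]` list (the new common tests pass `size=[224, 224]`) rather than a ready-made dict, so each helper now normalizes the value before indexing `size["height"]`. A minimal sketch of that normalization, assuming the behavior of the existing `get_size_dict` helper in `transformers.image_processing_utils` (per-call defaults such as `default_to_square` may differ):

    from transformers.image_processing_utils import get_size_dict

    # All three spellings of the same target size are normalized to a height/width dict,
    # so the size["height"] lookups in pad_image/thumbnail no longer fail for ints or lists.
    for raw_size in (896, (896, 672), {"height": 896, "width": 672}):
        normalized = get_size_dict(raw_size)
        print(normalized)  # e.g. {"height": 896, "width": 672}
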
diff --git a/src/transformers/models/fuyu/image_processing_fuyu.py b/src/transformers/models/fuyu/image_processing_fuyu.py index 255922b8308889..19eb1d0e7e3e77 100644 --- a/src/transformers/models/fuyu/image_processing_fuyu.py +++ b/src/transformers/models/fuyu/image_processing_fuyu.py @@ -19,7 +19,7 @@ import numpy as np -from ...image_processing_utils import BaseImageProcessor, BatchFeature +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict from ...image_transforms import ( pad, resize, @@ -344,6 +344,7 @@ def pad_image( The channel dimension format of the input image. If not provided, it will be inferred. """ image_height, image_width = get_image_size(image, input_data_format) + size = get_size_dict(size) target_height, target_width = size["height"], size["width"] padding_top = 0 padding_left = 0 diff --git a/src/transformers/models/nougat/image_processing_nougat.py b/src/transformers/models/nougat/image_processing_nougat.py index 792f4a14325a0a..ff8090964e26ef 100644 --- a/src/transformers/models/nougat/image_processing_nougat.py +++ b/src/transformers/models/nougat/image_processing_nougat.py @@ -250,6 +250,7 @@ def pad_image( input_data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format of the input image. If not provided, it will be inferred. """ + size = get_size_dict(size) output_height, output_width = size["height"], size["width"] input_height, input_width = get_image_size(image, channel_dim=input_data_format) @@ -292,6 +293,7 @@ def thumbnail( The channel dimension format of the input image. If not provided, it will be inferred. """ input_height, input_width = get_image_size(image, channel_dim=input_data_format) + size = get_size_dict(size) output_height, output_width = size["height"], size["width"] # We always resize to the smallest of either the input or output size. diff --git a/src/transformers/models/nougat/processing_nougat.py b/src/transformers/models/nougat/processing_nougat.py index ac7aef12d6199c..f63fcb4082f5f5 100644 --- a/src/transformers/models/nougat/processing_nougat.py +++ b/src/transformers/models/nougat/processing_nougat.py @@ -103,7 +103,11 @@ def __call__( tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, ) + # Temporary fix for "paddding_side" in init_kwargs + _ = output_kwargs["text_kwargs"].pop("padding_side", None) + # For backwards compatibility, we reuse `audio` as `text_pair` + # in case downstream users passed it as a positional argument if output_kwargs["text_kwargs"].get("text_pair") is not None and audio is not None: raise ValueError( "You cannot provide `text_pair` as a positional argument and as a keyword argument at the same time." @@ -113,11 +117,11 @@ def __call__( warnings.warn( "No `text_pair` kwarg was detected. The use of `text_pair` as an argument without specifying it explicitely as `text_pair=` will be deprecated in future versions." ) - # For backwards compatibility, we reuse `audio` as `text_pair` in case - # downstream users passed it as a positional argument if audio is not None: output_kwargs["text_kwargs"]["text_pair"] = audio + # For backwards compatibility, we reuse `videos` as `text_target` + # in case downstream users passed it as a positional argument if output_kwargs["text_kwargs"].get("text_target") is not None and videos is not None: raise ValueError( "You cannot provide `text_target` as a positional argument and as a keyword argument at the same time." @@ -127,11 +131,11 @@ def __call__( warnings.warn( "No `text_target` kwarg was detected. 
The use of `text_target` as an argument without specifying it explicitely as `text_target=` will be deprecated in future versions." ) - # For backwards compatibility, we reuse `videos` as `text_target` in case - # downstream users passed it as a positional argument if videos is not None: output_kwargs["text_kwargs"]["text_target"] = videos + # For backwards compatibility, we reuse `backwards_compatibility_placeholder_arg` as `text_pair_target` + # in case downstream users passed it as a positional argument if ( output_kwargs["text_kwargs"].get("text_pair_target") is not None and backwards_compatibility_placeholder_arg is not None @@ -144,8 +148,6 @@ def __call__( warnings.warn( "No `text_pair_target` kwarg was detected. The use of `text_pair_target` as an argument without specifying it explicitely as `text_pair_target=` will be deprecated in future versions." ) - # For backwards compatibility, we reuse `backwards_compatibility_placeholder_arg` as `text_pair_target` in case - # downstream users passed it as a positional argument if backwards_compatibility_placeholder_arg is not None: output_kwargs["text_kwargs"]["text_pair_target"] = backwards_compatibility_placeholder_arg diff --git a/src/transformers/models/sam/image_processing_sam.py b/src/transformers/models/sam/image_processing_sam.py index beea3f4b01c311..ff86fdeb577e08 100644 --- a/src/transformers/models/sam/image_processing_sam.py +++ b/src/transformers/models/sam/image_processing_sam.py @@ -185,6 +185,7 @@ def pad_image( input_data_format (`str` or `ChannelDimension`, *optional*): The channel dimension format of the input image. If not provided, it will be inferred. """ + pad_size = get_size_dict(pad_size) output_height, output_width = pad_size["height"], pad_size["width"] input_height, input_width = get_image_size(image, channel_dim=input_data_format) diff --git a/src/transformers/models/tvp/image_processing_tvp.py b/src/transformers/models/tvp/image_processing_tvp.py index 100ec133e8b026..dfa4902c9a442b 100644 --- a/src/transformers/models/tvp/image_processing_tvp.py +++ b/src/transformers/models/tvp/image_processing_tvp.py @@ -244,6 +244,7 @@ def pad_image( The channel dimension format of the input image. If not provided, it will be inferred. 
""" height, width = get_image_size(image, channel_dim=input_data_format) + pad_size = get_size_dict(pad_size) max_height = pad_size.get("height", height) max_width = pad_size.get("width", width) diff --git a/tests/models/nougat/test_processor_nougat.py b/tests/models/nougat/test_processor_nougat.py new file mode 100644 index 00000000000000..ca512684cc68bd --- /dev/null +++ b/tests/models/nougat/test_processor_nougat.py @@ -0,0 +1,17 @@ +import tempfile +import unittest + +from transformers import NougatProcessor + +from ...test_processing_common import ProcessorTesterMixin + + +class NougatProcessorTest(ProcessorTesterMixin, unittest.TestCase): + from_pretrained_id = "facebook/nougat-base" + text_data_arg_name = "labels" + processor_class = NougatProcessor + + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + processor = self.processor_class.from_pretrained(self.from_pretrained_id) + processor.save_pretrained(self.tmpdirname) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index a30c6363b9d7ff..577341fe531b6e 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -48,6 +48,8 @@ @require_vision @require_torch class ProcessorTesterMixin: + image_data_arg_name = "pixel_values" + text_data_arg_name = "input_ids" processor_class = None def prepare_processor_dict(self): @@ -136,7 +138,7 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input, return_tensors="pt") - self.assertEqual(len(inputs["input_ids"][0]), 117) + self.assertEqual(len(inputs[self.text_data_arg_name][0]), 117) @require_torch @require_vision @@ -153,7 +155,7 @@ def test_image_processor_defaults_preserved_by_image_kwargs(self): image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input) - self.assertEqual(len(inputs["pixel_values"][0][0]), 234) + self.assertEqual(len(inputs[self.image_data_arg_name][0][0]), 234) @require_vision @require_torch @@ -171,7 +173,7 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): inputs = processor( text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length" ) - self.assertEqual(len(inputs["input_ids"][0]), 112) + self.assertEqual(len(inputs[self.text_data_arg_name][0]), 112) @require_torch @require_vision @@ -188,7 +190,7 @@ def test_kwargs_overrides_default_image_processor_kwargs(self): image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input, size=[224, 224]) - self.assertEqual(len(inputs["pixel_values"][0][0]), 224) + self.assertEqual(len(inputs[self.image_data_arg_name][0][0]), 224) @require_torch @require_vision @@ -212,8 +214,8 @@ def test_unstructured_kwargs(self): max_length=76, ) - self.assertEqual(inputs["pixel_values"].shape[2], 214) - self.assertEqual(len(inputs["input_ids"][0]), 76) + self.assertEqual(inputs[self.image_data_arg_name].shape[2], 214) + self.assertEqual(len(inputs[self.text_data_arg_name][0]), 76) @require_torch @require_vision @@ -237,9 +239,9 @@ def test_unstructured_kwargs_batched(self): max_length=76, ) - self.assertEqual(inputs["pixel_values"].shape[2], 214) + self.assertEqual(inputs[self.image_data_arg_name].shape[2], 214) - self.assertEqual(len(inputs["input_ids"][0]), 6) + self.assertEqual(len(inputs[self.text_data_arg_name][0]), 6) @require_torch @require_vision @@ -286,9 +288,9 @@ def test_structured_kwargs_nested(self): inputs = processor(text=input_str, images=image_input, 
**all_kwargs) self.skip_processor_without_typed_kwargs(processor) - self.assertEqual(inputs["pixel_values"].shape[2], 214) + self.assertEqual(inputs[self.image_data_arg_name].shape[2], 214) - self.assertEqual(len(inputs["input_ids"][0]), 76) + self.assertEqual(len(inputs[self.text_data_arg_name][0]), 76) @require_torch @require_vision @@ -312,9 +314,9 @@ def test_structured_kwargs_nested_from_dict(self): } inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.assertEqual(inputs["pixel_values"].shape[2], 214) + self.assertEqual(inputs[self.image_data_arg_name].shape[2], 214) - self.assertEqual(len(inputs["input_ids"][0]), 76) + self.assertEqual(len(inputs[self.text_data_arg_name][0]), 76) class MyProcessor(ProcessorMixin): From 93e70702a952d546a08d15ed52a67ac33114e083 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Sat, 17 Aug 2024 14:17:29 +0800 Subject: [PATCH 03/10] add uniformization of processor kwargs of processors with special keys here --- .../models/clipseg/processing_clipseg.py | 72 +++++++++++++---- .../models/owlv2/image_processing_owlv2.py | 3 +- .../models/owlv2/processing_owlv2.py | 80 +++++++++++++++---- .../models/owlvit/processing_owlvit.py | 79 ++++++++++++++---- .../models/clipseg/test_processor_clipseg.py | 10 ++- tests/models/owlv2/test_processor_owlv2.py | 18 +++++ tests/models/owlvit/test_processor_owlvit.py | 10 ++- 7 files changed, 217 insertions(+), 55 deletions(-) create mode 100644 tests/models/owlv2/test_processor_owlv2.py diff --git a/src/transformers/models/clipseg/processing_clipseg.py b/src/transformers/models/clipseg/processing_clipseg.py index f8eaca82334a22..bbec55fabf99f7 100644 --- a/src/transformers/models/clipseg/processing_clipseg.py +++ b/src/transformers/models/clipseg/processing_clipseg.py @@ -16,10 +16,28 @@ Image/Text processor class for CLIPSeg """ +import sys import warnings +from typing import List, Optional, Union -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import BatchEncoding +from ...image_utils import ImageInput +from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput + + +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + + +class CLIPSegImagesKwargs(ImagesKwargs, total=False): + visual_prompt: Optional[ImageInput] + + +class CLIPSegProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: CLIPSegImagesKwargs + _defaults = {} class CLIPSegProcessor(ProcessorMixin): @@ -58,7 +76,14 @@ def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) - def __call__(self, text=None, images=None, visual_prompt=None, return_tensors=None, **kwargs): + def __call__( + self, + text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + images: Optional[ImageInput] = None, + audio=None, + videos=None, + **kwargs: Unpack[CLIPSegProcessorKwargs], + ): """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode @@ -79,14 +104,6 @@ def __call__(self, text=None, images=None, visual_prompt=None, return_tensors=No NumPy array or PyTorch tensor. 
In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a number of channels, H and W are image height and width. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - - `'tf'`: Return TensorFlow `tf.constant` objects. - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - - `'jax'`: Return JAX `jnp.ndarray` objects. - Returns: [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: @@ -96,6 +113,29 @@ def __call__(self, text=None, images=None, visual_prompt=None, return_tensors=No `None`). - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ + + output_kwargs = self._merge_kwargs( + CLIPSegProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + + if output_kwargs["images_kwargs"].get("visual_prompt") is not None and audio is not None: + raise ValueError( + "You cannot provide `visual_prompt` as a positional argument and as a keyword argument at the same time." + "Please provide it only as a keyword argument (i.e. `visual_prompt=...`)." + ) + if "visual_prompt" not in output_kwargs["images_kwargs"]: + warnings.warn( + "No `visual_prompt` kwarg was detected. The use of `visual_prompt` as an argument without specifying it explicitely as `visual_prompt=` will be deprecated in future versions." + ) + # For backwards compatibility, we reuse `audio` as `visual_prompt` in case + # downstream users passed it as a positional argument + if audio is not None: + output_kwargs["images_kwargs"]["visual_prompt"] = audio + + visual_prompt = output_kwargs["images_kwargs"].pop("visual_prompt", None) + if text is None and visual_prompt is None and images is None: raise ValueError("You have to specify either text, visual prompt or images.") @@ -103,13 +143,13 @@ def __call__(self, text=None, images=None, visual_prompt=None, return_tensors=No raise ValueError("You have to specify exactly one type of prompt. 
Either text or visual prompt.") if text is not None: - encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs) + encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) if visual_prompt is not None: - prompt_features = self.image_processor(visual_prompt, return_tensors=return_tensors, **kwargs) + prompt_features = self.image_processor(visual_prompt, **output_kwargs["images_kwargs"]) if images is not None: - image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs) + image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) if visual_prompt is not None and images is not None: encoding = { @@ -128,7 +168,9 @@ def __call__(self, text=None, images=None, visual_prompt=None, return_tensors=No } return encoding else: - return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) + return BatchEncoding( + data=dict(**image_features), tensor_type=output_kwargs["common_kwargs"].get("return_tensors") + ) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/owlv2/image_processing_owlv2.py b/src/transformers/models/owlv2/image_processing_owlv2.py index dd32dc9f141183..eca806af296316 100644 --- a/src/transformers/models/owlv2/image_processing_owlv2.py +++ b/src/transformers/models/owlv2/image_processing_owlv2.py @@ -19,7 +19,7 @@ import numpy as np -from ...image_processing_utils import BaseImageProcessor, BatchFeature +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict from ...image_transforms import ( center_to_corners_format, pad, @@ -296,6 +296,7 @@ def resize( """ requires_backends(self, "scipy") + size = get_size_dict(size) output_shape = (size["height"], size["width"]) image = to_channel_dimension_format(image, ChannelDimension.LAST) image, output_shape = _preprocess_resize_output_shape(image, output_shape) diff --git a/src/transformers/models/owlv2/processing_owlv2.py b/src/transformers/models/owlv2/processing_owlv2.py index 8b580ca5026618..dc8fefd434762b 100644 --- a/src/transformers/models/owlv2/processing_owlv2.py +++ b/src/transformers/models/owlv2/processing_owlv2.py @@ -16,15 +16,40 @@ Image/Text processor class for OWLv2 """ -from typing import List +import sys +import warnings +from typing import List, Optional, Union import numpy as np -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import BatchEncoding +from ...image_utils import ImageInput +from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput from ...utils import is_flax_available, is_tf_available, is_torch_available +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + + +class Owlv2ImagesKwargs(ImagesKwargs, total=False): + query_images: Optional[ImageInput] + + +class Owlv2ProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: Owlv2ImagesKwargs + _defaults = { + "text_kwargs": { + "padding": "max_length", + }, + "common_kwargs": { + "return_tensors": "np", + }, + } + + class Owlv2Processor(ProcessorMixin): r""" Constructs an Owlv2 processor which wraps [`Owlv2ImageProcessor`] and [`CLIPTokenizer`]/[`CLIPTokenizerFast`] into @@ -45,8 +70,14 @@ class Owlv2Processor(ProcessorMixin): def __init__(self, image_processor, tokenizer, **kwargs): super().__init__(image_processor, tokenizer) - # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.__call__ with 
OWLViT->OWLv2 - def __call__(self, text=None, images=None, query_images=None, padding="max_length", return_tensors="np", **kwargs): + def __call__( + self, + text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + images: Optional[ImageInput] = None, + audio=None, + videos=None, + **kwargs: Unpack[Owlv2ProcessorKwargs], + ): """ Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode: @@ -67,12 +98,7 @@ def __call__(self, text=None, images=None, query_images=None, padding="max_lengt The query image to be prepared, one query image is expected per target image to be queried. Each image can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a number of channels, H and W are image height and width. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - `'tf'`: Return TensorFlow `tf.constant` objects. - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - - `'jax'`: Return JAX `jnp.ndarray` objects. + Returns: [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. @@ -81,6 +107,28 @@ def __call__(self, text=None, images=None, query_images=None, padding="max_lengt `None`). - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ + output_kwargs = self._merge_kwargs( + Owlv2ProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + + if output_kwargs["images_kwargs"].get("query_images") is not None and audio is not None: + raise ValueError( + "You cannot provide `query_images` as a positional argument and as a keyword argument at the same time." + "Please provide it only as a keyword argument (i.e. `query_images=...`)." + ) + if "query_images" not in output_kwargs["images_kwargs"]: + warnings.warn( + "No `query_images` kwarg was detected. The use of `query_images` as an argument without specifying it explicitely as `query_images=` will be deprecated in future versions." 
+ ) + # For backwards compatibility, we reuse `audio` as `query_images` in case + # downstream users passed it as a positional argument + if audio is not None: + output_kwargs["images_kwargs"]["query_images"] = audio + + query_images = output_kwargs["images_kwargs"].pop("query_images", None) + return_tensors = output_kwargs["common_kwargs"]["return_tensors"] if text is None and query_images is None and images is None: raise ValueError( @@ -89,7 +137,7 @@ def __call__(self, text=None, images=None, query_images=None, padding="max_lengt if text is not None: if isinstance(text, str) or (isinstance(text, List) and not isinstance(text[0], List)): - encodings = [self.tokenizer(text, padding=padding, return_tensors=return_tensors, **kwargs)] + encodings = [self.tokenizer(text, **output_kwargs["text_kwargs"])] elif isinstance(text, List) and isinstance(text[0], List): encodings = [] @@ -102,7 +150,7 @@ def __call__(self, text=None, images=None, query_images=None, padding="max_lengt if len(t) != max_num_queries: t = t + [" "] * (max_num_queries - len(t)) - encoding = self.tokenizer(t, padding=padding, return_tensors=return_tensors, **kwargs) + encoding = self.tokenizer(t, **output_kwargs["text_kwargs"]) encodings.append(encoding) else: raise TypeError("Input text should be a string, a list of strings or a nested list of strings") @@ -138,13 +186,11 @@ def __call__(self, text=None, images=None, query_images=None, padding="max_lengt if query_images is not None: encoding = BatchEncoding() - query_pixel_values = self.image_processor( - query_images, return_tensors=return_tensors, **kwargs - ).pixel_values + query_pixel_values = self.image_processor(query_images, **output_kwargs["images_kwargs"]).pixel_values encoding["query_pixel_values"] = query_pixel_values if images is not None: - image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs) + image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) if text is not None and images is not None: encoding["pixel_values"] = image_features.pixel_values diff --git a/src/transformers/models/owlvit/processing_owlvit.py b/src/transformers/models/owlvit/processing_owlvit.py index 2c7d490104bdfc..d6f8389b94c4b9 100644 --- a/src/transformers/models/owlvit/processing_owlvit.py +++ b/src/transformers/models/owlvit/processing_owlvit.py @@ -16,16 +16,40 @@ Image/Text processor class for OWL-ViT """ +import sys import warnings -from typing import List +from typing import List, Optional, Union import numpy as np -from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import BatchEncoding +from ...image_utils import ImageInput +from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput from ...utils import is_flax_available, is_tf_available, is_torch_available +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + + +class OwlViTImagesKwargs(ImagesKwargs, total=False): + query_images: Optional[ImageInput] + + +class OwlViTProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: OwlViTImagesKwargs + _defaults = { + "text_kwargs": { + "padding": "max_length", + }, + "common_kwargs": { + "return_tensors": "np", + }, + } + + class OwlViTProcessor(ProcessorMixin): r""" Constructs an OWL-ViT processor which wraps [`OwlViTImageProcessor`] and [`CLIPTokenizer`]/[`CLIPTokenizerFast`] @@ -61,7 +85,14 @@ def __init__(self, image_processor=None, 
tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) - def __call__(self, text=None, images=None, query_images=None, padding="max_length", return_tensors="np", **kwargs): + def __call__( + self, + text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + images: Optional[ImageInput] = None, + audio=None, + videos=None, + **kwargs: Unpack[OwlViTProcessorKwargs], + ): """ Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode: @@ -82,12 +113,7 @@ def __call__(self, text=None, images=None, query_images=None, padding="max_lengt The query image to be prepared, one query image is expected per target image to be queried. Each image can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a number of channels, H and W are image height and width. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - `'tf'`: Return TensorFlow `tf.constant` objects. - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - - `'jax'`: Return JAX `jnp.ndarray` objects. + Returns: [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. @@ -97,6 +123,29 @@ def __call__(self, text=None, images=None, query_images=None, padding="max_lengt - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ + output_kwargs = self._merge_kwargs( + OwlViTProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + + if output_kwargs["images_kwargs"].get("query_images") is not None and audio is not None: + raise ValueError( + "You cannot provide `query_images` as a positional argument and as a keyword argument at the same time." + "Please provide it only as a keyword argument (i.e. `query_images=...`)." + ) + if "query_images" not in output_kwargs["images_kwargs"]: + warnings.warn( + "No `query_images` kwarg was detected. The use of `query_images` as an argument without specifying it explicitely as `query_images=` will be deprecated in future versions." + ) + # For backwards compatibility, we reuse `audio` as `query_images` in case + # downstream users passed it as a positional argument + if audio is not None: + output_kwargs["images_kwargs"]["query_images"] = audio + + query_images = output_kwargs["images_kwargs"].pop("query_images", None) + return_tensors = output_kwargs["common_kwargs"]["return_tensors"] + if text is None and query_images is None and images is None: raise ValueError( "You have to specify at least one text or query image or image. All three cannot be none." 
@@ -104,7 +153,7 @@ def __call__(self, text=None, images=None, query_images=None, padding="max_lengt if text is not None: if isinstance(text, str) or (isinstance(text, List) and not isinstance(text[0], List)): - encodings = [self.tokenizer(text, padding=padding, return_tensors=return_tensors, **kwargs)] + encodings = [self.tokenizer(text, **output_kwargs["text_kwargs"])] elif isinstance(text, List) and isinstance(text[0], List): encodings = [] @@ -117,7 +166,7 @@ def __call__(self, text=None, images=None, query_images=None, padding="max_lengt if len(t) != max_num_queries: t = t + [" "] * (max_num_queries - len(t)) - encoding = self.tokenizer(t, padding=padding, return_tensors=return_tensors, **kwargs) + encoding = self.tokenizer(t, **output_kwargs["text_kwargs"]) encodings.append(encoding) else: raise TypeError("Input text should be a string, a list of strings or a nested list of strings") @@ -153,13 +202,11 @@ def __call__(self, text=None, images=None, query_images=None, padding="max_lengt if query_images is not None: encoding = BatchEncoding() - query_pixel_values = self.image_processor( - query_images, return_tensors=return_tensors, **kwargs - ).pixel_values + query_pixel_values = self.image_processor(query_images, **output_kwargs["images_kwargs"]).pixel_values encoding["query_pixel_values"] = query_pixel_values if images is not None: - image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs) + image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) if text is not None and images is not None: encoding["pixel_values"] = image_features.pixel_values diff --git a/tests/models/clipseg/test_processor_clipseg.py b/tests/models/clipseg/test_processor_clipseg.py index e33049b2768fe4..bfc7d59c1b0ff3 100644 --- a/tests/models/clipseg/test_processor_clipseg.py +++ b/tests/models/clipseg/test_processor_clipseg.py @@ -21,20 +21,24 @@ import numpy as np import pytest -from transformers import CLIPTokenizer, CLIPTokenizerFast +from transformers import CLIPSegProcessor, CLIPTokenizer, CLIPTokenizerFast from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES from transformers.testing_utils import require_vision from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available +from ...test_processing_common import ProcessorTesterMixin + if is_vision_available(): from PIL import Image - from transformers import CLIPSegProcessor, ViTImageProcessor + from transformers import ViTImageProcessor @require_vision -class CLIPSegProcessorTest(unittest.TestCase): +class CLIPSegProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = CLIPSegProcessor + def setUp(self): self.tmpdirname = tempfile.mkdtemp() diff --git a/tests/models/owlv2/test_processor_owlv2.py b/tests/models/owlv2/test_processor_owlv2.py new file mode 100644 index 00000000000000..b8f8b5d26cfe41 --- /dev/null +++ b/tests/models/owlv2/test_processor_owlv2.py @@ -0,0 +1,18 @@ +import tempfile +import unittest + +from transformers import Owlv2Processor +from transformers.testing_utils import require_scipy + +from ...test_processing_common import ProcessorTesterMixin + + +@require_scipy +class Owlv2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): + from_pretrained_id = "google/owlv2-base-patch16-ensemble" + processor_class = Owlv2Processor + + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + processor = self.processor_class.from_pretrained(self.from_pretrained_id) + processor.save_pretrained(self.tmpdirname) diff --git 
a/tests/models/owlvit/test_processor_owlvit.py b/tests/models/owlvit/test_processor_owlvit.py index b271c8880bfddc..3fadfac0046002 100644 --- a/tests/models/owlvit/test_processor_owlvit.py +++ b/tests/models/owlvit/test_processor_owlvit.py @@ -21,20 +21,24 @@ import numpy as np import pytest -from transformers import CLIPTokenizer, CLIPTokenizerFast +from transformers import CLIPTokenizer, CLIPTokenizerFast, OwlViTProcessor from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES from transformers.testing_utils import require_vision from transformers.utils import IMAGE_PROCESSOR_NAME, is_vision_available +from ...test_processing_common import ProcessorTesterMixin + if is_vision_available(): from PIL import Image - from transformers import OwlViTImageProcessor, OwlViTProcessor + from transformers import OwlViTImageProcessor @require_vision -class OwlViTProcessorTest(unittest.TestCase): +class OwlViTProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = OwlViTProcessor + def setUp(self): self.tmpdirname = tempfile.mkdtemp() From 0128d199daaf90cb196ffcf82256b6df8d3364f3 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Sat, 17 Aug 2024 17:39:46 +0800 Subject: [PATCH 04/10] refactor how we handle arguments passed as positional args --- .../models/clipseg/processing_clipseg.py | 28 +--- .../models/nougat/processing_nougat.py | 68 +-------- .../models/owlv2/processing_owlv2.py | 28 ++-- .../models/owlvit/processing_owlvit.py | 27 ++-- src/transformers/processing_utils.py | 62 ++++++++ .../models/clipseg/test_processor_clipseg.py | 17 +++ tests/test_processing_common.py | 141 +++++++++++------- 7 files changed, 197 insertions(+), 174 deletions(-) diff --git a/src/transformers/models/clipseg/processing_clipseg.py b/src/transformers/models/clipseg/processing_clipseg.py index bbec55fabf99f7..7219fc7d4831ea 100644 --- a/src/transformers/models/clipseg/processing_clipseg.py +++ b/src/transformers/models/clipseg/processing_clipseg.py @@ -21,7 +21,7 @@ from typing import List, Optional, Union from ...image_utils import ImageInput -from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin +from ...processing_utils import ProcessingKwargs, ProcessorMixin from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput @@ -31,12 +31,7 @@ from typing_extensions import Unpack -class CLIPSegImagesKwargs(ImagesKwargs, total=False): - visual_prompt: Optional[ImageInput] - - class CLIPSegProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: CLIPSegImagesKwargs _defaults = {} @@ -57,6 +52,8 @@ class CLIPSegProcessor(ProcessorMixin): attributes = ["image_processor", "tokenizer"] image_processor_class = "ViTImageProcessor" tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast") + # For backward compatibility. See transformers.processing_utils.ProcessorMixin.prepare_and_validate_optional_call_args for more details. + optional_call_args = ["visual_prompt"] def __init__(self, image_processor=None, tokenizer=None, **kwargs): feature_extractor = None @@ -80,6 +77,10 @@ def __call__( self, text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, images: Optional[ImageInput] = None, + # The following is to capture `visual_prompt` argument that may be passed as a positional argument. + # See transformers.processing_utils.ProcessorMixin.prepare_and_validate_optional_call_args for more details. 
+ # This behavior is only needed for backward compatibility and will be removed in future versions. + *args, audio=None, videos=None, **kwargs: Unpack[CLIPSegProcessorKwargs], @@ -118,22 +119,9 @@ def __call__( CLIPSegProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, + **self.prepare_and_validate_optional_call_args(*args), ) - if output_kwargs["images_kwargs"].get("visual_prompt") is not None and audio is not None: - raise ValueError( - "You cannot provide `visual_prompt` as a positional argument and as a keyword argument at the same time." - "Please provide it only as a keyword argument (i.e. `visual_prompt=...`)." - ) - if "visual_prompt" not in output_kwargs["images_kwargs"]: - warnings.warn( - "No `visual_prompt` kwarg was detected. The use of `visual_prompt` as an argument without specifying it explicitely as `visual_prompt=` will be deprecated in future versions." - ) - # For backwards compatibility, we reuse `audio` as `visual_prompt` in case - # downstream users passed it as a positional argument - if audio is not None: - output_kwargs["images_kwargs"]["visual_prompt"] = audio - visual_prompt = output_kwargs["images_kwargs"].pop("visual_prompt", None) if text is None and visual_prompt is None and images is None: diff --git a/src/transformers/models/nougat/processing_nougat.py b/src/transformers/models/nougat/processing_nougat.py index f63fcb4082f5f5..cbb43449dbe355 100644 --- a/src/transformers/models/nougat/processing_nougat.py +++ b/src/transformers/models/nougat/processing_nougat.py @@ -17,11 +17,10 @@ """ import sys -import warnings from typing import List, Optional, Union from ...image_utils import ImageInput -from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs +from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin from ...tokenization_utils_base import PreTokenizedInput, TextInput @@ -31,12 +30,6 @@ from typing_extensions import Unpack -class NougatTextKwargs(TextKwargs, total=False): - text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] - text_target: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] - text_pair_target: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] - - class NougatImagesKwargs(ImagesKwargs, total=False): do_crop_margin: Optional[bool] do_thumbnail: Optional[bool] @@ -44,7 +37,6 @@ class NougatImagesKwargs(ImagesKwargs, total=False): class NougatProcessorKwargs(ProcessingKwargs, total=False): - text_kwargs: NougatTextKwargs images_kwargs: NougatImagesKwargs _defaults = { "text_kwargs": { @@ -92,7 +84,6 @@ def __call__( text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, audio=None, videos=None, - backwards_compatibility_placeholder_arg=None, **kwargs: Unpack[NougatProcessorKwargs], ): if images is None and text is None: @@ -106,65 +97,10 @@ def __call__( # Temporary fix for "paddding_side" in init_kwargs _ = output_kwargs["text_kwargs"].pop("padding_side", None) - # For backwards compatibility, we reuse `audio` as `text_pair` - # in case downstream users passed it as a positional argument - if output_kwargs["text_kwargs"].get("text_pair") is not None and audio is not None: - raise ValueError( - "You cannot provide `text_pair` as a positional argument and as a keyword argument at the same time." - "Please provide it only as a keyword argument (i.e. `text_pair=...`)." 
- ) - if "text_pair" not in output_kwargs["text_kwargs"]: - warnings.warn( - "No `text_pair` kwarg was detected. The use of `text_pair` as an argument without specifying it explicitely as `text_pair=` will be deprecated in future versions." - ) - if audio is not None: - output_kwargs["text_kwargs"]["text_pair"] = audio - - # For backwards compatibility, we reuse `videos` as `text_target` - # in case downstream users passed it as a positional argument - if output_kwargs["text_kwargs"].get("text_target") is not None and videos is not None: - raise ValueError( - "You cannot provide `text_target` as a positional argument and as a keyword argument at the same time." - "Please provide it only as a keyword argument (i.e. `text_target=...`)." - ) - if "text_target" not in output_kwargs["text_kwargs"]: - warnings.warn( - "No `text_target` kwarg was detected. The use of `text_target` as an argument without specifying it explicitely as `text_target=` will be deprecated in future versions." - ) - if videos is not None: - output_kwargs["text_kwargs"]["text_target"] = videos - - # For backwards compatibility, we reuse `backwards_compatibility_placeholder_arg` as `text_pair_target` - # in case downstream users passed it as a positional argument - if ( - output_kwargs["text_kwargs"].get("text_pair_target") is not None - and backwards_compatibility_placeholder_arg is not None - ): - raise ValueError( - "You cannot provide `text_pair_target` as a positional argument and as a keyword argument at the same time." - "Please provide it only as a keyword argument (i.e. `text_pair_target=...`)." - ) - if "text_pair_target" not in output_kwargs["text_kwargs"]: - warnings.warn( - "No `text_pair_target` kwarg was detected. The use of `text_pair_target` as an argument without specifying it explicitely as `text_pair_target=` will be deprecated in future versions." - ) - if backwards_compatibility_placeholder_arg is not None: - output_kwargs["text_kwargs"]["text_pair_target"] = backwards_compatibility_placeholder_arg - - text_pair = output_kwargs["text_kwargs"].pop("text_pair", None) - text_target = output_kwargs["text_kwargs"].pop("text_target", None) - text_pair_target = output_kwargs["text_kwargs"].pop("text_pair_target", None) - if images is not None: inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) if text is not None: - encodings = self.tokenizer( - text, - text_pair=text_pair, - text_target=text_target, - text_pair_target=text_pair_target, - **output_kwargs["text_kwargs"], - ) + encodings = self.tokenizer(text, **output_kwargs["text_kwargs"]) if text is None: return inputs diff --git a/src/transformers/models/owlv2/processing_owlv2.py b/src/transformers/models/owlv2/processing_owlv2.py index dc8fefd434762b..36af9c29c69da7 100644 --- a/src/transformers/models/owlv2/processing_owlv2.py +++ b/src/transformers/models/owlv2/processing_owlv2.py @@ -17,7 +17,6 @@ """ import sys -import warnings from typing import List, Optional, Union import numpy as np @@ -66,6 +65,8 @@ class Owlv2Processor(ProcessorMixin): attributes = ["image_processor", "tokenizer"] image_processor_class = "Owlv2ImageProcessor" tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast") + # For backward compatibility. See transformers.processing_utils.ProcessorMixin.prepare_and_validate_optional_call_args for more details. 
+ optional_call_args = ["query_images"] def __init__(self, image_processor, tokenizer, **kwargs): super().__init__(image_processor, tokenizer) @@ -74,6 +75,10 @@ def __call__( self, text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, images: Optional[ImageInput] = None, + # The following is to capture `visual_prompt` argument that may be passed as a positional argument. + # See transformers.processing_utils.ProcessorMixin.prepare_and_validate_optional_call_args for more details. + # This behavior is only needed for backward compatibility and will be removed in future versions. + *args, audio=None, videos=None, **kwargs: Unpack[Owlv2ProcessorKwargs], @@ -86,15 +91,15 @@ def __call__( of the above two methods for more information. Args: - text (`str`, `List[str]`, `List[List[str]]`): + text (`str`, `List[str]`, `List[List[str]]`, *optional*): The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, - `List[torch.Tensor]`): + `List[torch.Tensor]`, *optional*): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. Both channels-first and channels-last formats are supported. - query_images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + query_images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*): The query image to be prepared, one query image is expected per target image to be queried. Each image can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a number of channels, H and W are image height and width. @@ -111,22 +116,9 @@ def __call__( Owlv2ProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, + **self.prepare_and_validate_optional_call_args(*args), ) - if output_kwargs["images_kwargs"].get("query_images") is not None and audio is not None: - raise ValueError( - "You cannot provide `query_images` as a positional argument and as a keyword argument at the same time." - "Please provide it only as a keyword argument (i.e. `query_images=...`)." - ) - if "query_images" not in output_kwargs["images_kwargs"]: - warnings.warn( - "No `query_images` kwarg was detected. The use of `query_images` as an argument without specifying it explicitely as `query_images=` will be deprecated in future versions." 
- ) - # For backwards compatibility, we reuse `audio` as `query_images` in case - # downstream users passed it as a positional argument - if audio is not None: - output_kwargs["images_kwargs"]["query_images"] = audio - query_images = output_kwargs["images_kwargs"].pop("query_images", None) return_tensors = output_kwargs["common_kwargs"]["return_tensors"] diff --git a/src/transformers/models/owlvit/processing_owlvit.py b/src/transformers/models/owlvit/processing_owlvit.py index d6f8389b94c4b9..fe6dcc96907d40 100644 --- a/src/transformers/models/owlvit/processing_owlvit.py +++ b/src/transformers/models/owlvit/processing_owlvit.py @@ -66,6 +66,8 @@ class OwlViTProcessor(ProcessorMixin): attributes = ["image_processor", "tokenizer"] image_processor_class = "OwlViTImageProcessor" tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast") + # For backward compatibility. See transformers.processing_utils.ProcessorMixin.prepare_and_validate_optional_call_args for more details. + optional_call_args = ["query_images"] def __init__(self, image_processor=None, tokenizer=None, **kwargs): feature_extractor = None @@ -89,6 +91,10 @@ def __call__( self, text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, images: Optional[ImageInput] = None, + # The following is to capture `visual_prompt` argument that may be passed as a positional argument. + # See transformers.processing_utils.ProcessorMixin.prepare_and_validate_optional_call_args for more details. + # This behavior is only needed for backward compatibility and will be removed in future versions. + *args, audio=None, videos=None, **kwargs: Unpack[OwlViTProcessorKwargs], @@ -101,15 +107,15 @@ def __call__( of the above two methods for more information. Args: - text (`str`, `List[str]`, `List[List[str]]`): + text (`str`, `List[str]`, `List[List[str]]`, *optional*): The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, - `List[torch.Tensor]`): + `List[torch.Tensor]`, *optional*): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. Both channels-first and channels-last formats are supported. - query_images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + query_images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*): The query image to be prepared, one query image is expected per target image to be queried. Each image can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a number of channels, H and W are image height and width. @@ -127,22 +133,9 @@ def __call__( OwlViTProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, + **self.prepare_and_validate_optional_call_args(*args), ) - if output_kwargs["images_kwargs"].get("query_images") is not None and audio is not None: - raise ValueError( - "You cannot provide `query_images` as a positional argument and as a keyword argument at the same time." - "Please provide it only as a keyword argument (i.e. `query_images=...`)." 
- ) - if "query_images" not in output_kwargs["images_kwargs"]: - warnings.warn( - "No `query_images` kwarg was detected. The use of `query_images` as an argument without specifying it explicitely as `query_images=` will be deprecated in future versions." - ) - # For backwards compatibility, we reuse `audio` as `query_images` in case - # downstream users passed it as a positional argument - if audio is not None: - output_kwargs["images_kwargs"]["query_images"] = audio - query_images = output_kwargs["images_kwargs"].pop("query_images", None) return_tensors = output_kwargs["common_kwargs"]["return_tensors"] diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 09f62481956e77..98d0ad5f5bb70b 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -35,7 +35,9 @@ from .tokenization_utils_base import ( PaddingStrategy, + PreTokenizedInput, PreTrainedTokenizerBase, + TextInput, TruncationStrategy, ) from .utils import ( @@ -106,6 +108,9 @@ class TextKwargs(TypedDict, total=False): The side on which padding will be applied. """ + text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] + text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] + text_pair_target: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] add_special_tokens: Optional[bool] padding: Union[bool, str, PaddingStrategy] truncation: Union[bool, str, TruncationStrategy] @@ -317,6 +322,7 @@ class ProcessorMixin(PushToHubMixin): attributes = ["feature_extractor", "tokenizer"] optional_attributes = ["chat_template"] + optional_call_args: List[str] = [] # Names need to be attr_class for attr in attributes feature_extractor_class = None tokenizer_class = None @@ -956,6 +962,62 @@ def validate_init_kwargs(processor_config, valid_kwargs): unused_kwargs = {k: processor_config[k] for k in unused_keys} return unused_kwargs + def prepare_and_validate_optional_call_args(self, *args): + """ + Matches optional positional arguments to their corresponding names in `optional_call_args` + in the processor class in the order they are passed to the processor call. + + Note that this should only be used in the `__call__` method of the processors with special + arguments. Special arguments are arguments that aren't `text`, `images`, `audio`, nor `videos` + but also aren't passed to the tokenizer, image processor, etc. Examples of such processors are: + - `CLIPSegProcessor` + - `LayoutLMv2Processor` + - `OwlViTProcessor` + + Also note that passing by position to the processor call is now deprecated and will be disallowed + in future versions. We only have this for backward compatibility. + + Example: + Suppose that the processor class has `optional_call_args = ["arg_name_1", "arg_name_2"]`. + And we define the call method as: + ```python + def __call__( + self, + text: str, + images: Optional[ImageInput] = None, + *args, + audio=None, + videos=None, + ) + ``` + + Then, if we call the processor as: + ```python + images = [...] + processor("What is common in these images?", images, "arg_value_1", "arg_value_2") + ``` + + Then, this method will return: + ```python + { + "arg_name_1": "arg_value_1", + "arg_name_2": "arg_value_2", + } + ``` + which we could then pass as kwargs to `self._merge_kwargs` + """ + if len(args): + warnings.warn( + "Passing positional arguments to the processor call is now deprecated and will be disallowed in future versions. 
" + "Please pass all arguments as keyword arguments." + ) + if len(args) > len(self.optional_call_args): + raise ValueError( + f"Expected *at most* {len(self.optional_call_args)} optional positional arguments in processor call but received {len(args)}." + "Passing positional arguments to the processor call is not recommended" + ) + return {arg_name: arg_value for arg_value, arg_name in zip(args, self.optional_call_args)} + def apply_chat_template( self, conversation: Union[List[Dict[str, str]]], diff --git a/tests/models/clipseg/test_processor_clipseg.py b/tests/models/clipseg/test_processor_clipseg.py index bfc7d59c1b0ff3..a535219a23fb0c 100644 --- a/tests/models/clipseg/test_processor_clipseg.py +++ b/tests/models/clipseg/test_processor_clipseg.py @@ -193,6 +193,23 @@ def test_processor_visual_prompt(self): with pytest.raises(ValueError): processor() + def test_processor_visual_prompt_positional(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = CLIPSegProcessor(tokenizer=tokenizer, image_processor=image_processor) + + image_input = self.prepare_image_inputs() + visual_prompt_input = self.prepare_image_inputs() + + inputs = processor(None, image_input, visual_prompt_input) + + self.assertListEqual(list(inputs.keys()), ["pixel_values", "conditional_pixel_values"]) + + # test if it raises when no input is passed + with pytest.raises(ValueError): + processor() + def test_tokenizer_decode(self): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer() diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index 577341fe531b6e..bad0eb8cd6b72b 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -26,6 +26,7 @@ import unittest import numpy as np +from huggingface_hub import hf_hub_download from transformers import CLIPTokenizerFast, ProcessorMixin from transformers.models.auto.processing_auto import processor_class_from_name @@ -48,9 +49,10 @@ @require_vision @require_torch class ProcessorTesterMixin: - image_data_arg_name = "pixel_values" - text_data_arg_name = "input_ids" processor_class = None + text_data_arg_name = "input_ids" + images_data_arg_name = "pixel_values" + videos_data_arg_name = "pixel_values_videos" def prepare_processor_dict(self): return {} @@ -90,6 +92,13 @@ def prepare_image_inputs(self): image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] return image_inputs + @require_vision + def prepare_video_inputs(self): + video_file = hf_hub_download( + repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset" + ) + return [np.load(video_file)] + def test_processor_to_json_string(self): processor = self.get_processor() obj = json.loads(processor.to_json_string()) @@ -129,43 +138,69 @@ def skip_processor_without_typed_kwargs(self, processor): def test_tokenizer_defaults_preserved_by_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") + processor_components = self.prepare_components() + processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor = self.processor_class(**processor_components) 
self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input, return_tensors="pt") - self.assertEqual(len(inputs[self.text_data_arg_name][0]), 117) + self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 117) @require_torch @require_vision def test_image_processor_defaults_preserved_by_image_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", size=(234, 234)) + processor_components = self.prepare_components() + processor_components["image_processor"] = self.get_component( + "image_processor", size=(234, 234), crop_size=(234, 234) + ) + processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") + + processor = self.processor_class(**processor_components) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input, return_tensors="pt") + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 234) + + @require_torch + @require_vision + def test_video_processor_defaults_preserved_by_kwargs(self): + if "video_processor" not in self.processor_class.attributes: + self.skipTest(f"video_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor", size=(234, 234), crop_size=(234, 234)) + video_processor = self.get_component("video_processor", size=(234, 234), crop_size=(234, 234)) tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor = self.processor_class( + tokenizer=tokenizer, + image_processor=image_processor, + video_processor=video_processor, + ) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() + video_input = self.prepare_video_inputs() - inputs = processor(text=input_str, images=image_input) - self.assertEqual(len(inputs[self.image_data_arg_name][0][0]), 234) + inputs = processor(text=input_str, images=image_input, videos=video_input, return_tensors="pt") + self.assertEqual(inputs[self.videos_data_arg_name].shape[-1], 234) @require_vision @require_torch def test_kwargs_overrides_default_tokenizer_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer", padding="longest") + processor_components = self.prepare_components() + processor_components["tokenizer"] = self.get_component("tokenizer", padding="longest") - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() @@ -173,34 +208,35 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): inputs = processor( text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length" ) - self.assertEqual(len(inputs[self.text_data_arg_name][0]), 112) + 
self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 112) @require_torch @require_vision def test_kwargs_overrides_default_image_processor_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", size=(234, 234)) - tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") + processor_components = self.prepare_components() + processor_components["image_processor"] = self.get_component("image_processor", size=(234, 234)) + processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input, size=[224, 224]) - self.assertEqual(len(inputs[self.image_data_arg_name][0][0]), 224) + inputs = processor( + text=input_str, images=image_input, size=[224, 224], crop_size=(224, 224), return_tensors="pt" + ) + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 224) @require_torch @require_vision def test_unstructured_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" @@ -210,22 +246,21 @@ def test_unstructured_kwargs(self): images=image_input, return_tensors="pt", size={"height": 214, "width": 214}, + crop_size={"height": 214, "width": 214}, padding="max_length", max_length=76, ) - self.assertEqual(inputs[self.image_data_arg_name].shape[2], 214) - self.assertEqual(len(inputs[self.text_data_arg_name][0]), 76) + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 214) + self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 76) @require_torch @require_vision def test_unstructured_kwargs_batched(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = ["lower newer", "upper older longer string"] @@ -235,23 +270,21 @@ def test_unstructured_kwargs_batched(self): images=image_input, return_tensors="pt", size={"height": 214, "width": 214}, + crop_size={"height": 214, "width": 214}, padding="longest", max_length=76, ) - self.assertEqual(inputs[self.image_data_arg_name].shape[2], 214) - - self.assertEqual(len(inputs[self.text_data_arg_name][0]), 6) + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 214) + self.assertEqual(len(inputs[self.text_data_arg_name][0]), len(inputs[self.text_data_arg_name][1])) 
@require_torch @require_vision def test_doubly_passed_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = ["lower newer"] @@ -262,6 +295,8 @@ def test_doubly_passed_kwargs(self): images=image_input, images_kwargs={"size": {"height": 222, "width": 222}}, size={"height": 214, "width": 214}, + crop_size={"height": 214, "width": 214}, + return_tensors="pt", ) @require_torch @@ -269,10 +304,8 @@ def test_doubly_passed_kwargs(self): def test_structured_kwargs_nested(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" @@ -281,42 +314,44 @@ def test_structured_kwargs_nested(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"size": {"height": 214, "width": 214}}, + "images_kwargs": { + "size": {"height": 214, "width": 214}, + "crop_size": {"height": 214, "width": 214}, + }, "text_kwargs": {"padding": "max_length", "max_length": 76}, } inputs = processor(text=input_str, images=image_input, **all_kwargs) self.skip_processor_without_typed_kwargs(processor) - self.assertEqual(inputs[self.image_data_arg_name].shape[2], 214) - - self.assertEqual(len(inputs[self.text_data_arg_name][0]), 76) + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 214) + self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 76) @require_torch @require_vision def test_structured_kwargs_nested_from_dict(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + processor_components = self.prepare_components() + processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" image_input = self.prepare_image_inputs() # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"size": {"height": 214, "width": 214}}, + "images_kwargs": { + "size": {"height": 214, "width": 214}, + "crop_size": {"height": 214, "width": 214}, + }, "text_kwargs": {"padding": "max_length", "max_length": 76}, } inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.assertEqual(inputs[self.image_data_arg_name].shape[2], 214) - - self.assertEqual(len(inputs[self.text_data_arg_name][0]), 76) + self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 214) + 
self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 76) class MyProcessor(ProcessorMixin): From 8c36cfb46f51259313bb82941652827cbe20852a Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Mon, 19 Aug 2024 15:51:01 +0800 Subject: [PATCH 05/10] address @zucchini's comments --- .../models/clipseg/processing_clipseg.py | 7 +++-- .../models/nougat/processing_nougat.py | 30 +++++++++++++++++-- .../models/owlv2/processing_owlv2.py | 11 +++---- .../models/owlvit/processing_owlvit.py | 11 +++---- src/transformers/processing_utils.py | 6 ++-- 5 files changed, 48 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/clipseg/processing_clipseg.py b/src/transformers/models/clipseg/processing_clipseg.py index 7219fc7d4831ea..f99a8231fe343d 100644 --- a/src/transformers/models/clipseg/processing_clipseg.py +++ b/src/transformers/models/clipseg/processing_clipseg.py @@ -21,8 +21,9 @@ from typing import List, Optional, Union from ...image_utils import ImageInput +from ...feature_extraction_utils import BatchFeature from ...processing_utils import ProcessingKwargs, ProcessorMixin -from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput +from ...tokenization_utils_base import PreTokenizedInput, TextInput if sys.version_info >= (3, 11): @@ -106,7 +107,7 @@ def __call__( (C, H, W), where C is a number of channels, H and W are image height and width. Returns: - [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when @@ -156,7 +157,7 @@ def __call__( } return encoding else: - return BatchEncoding( + return BatchFeature( data=dict(**image_features), tensor_type=output_kwargs["common_kwargs"].get("return_tensors") ) diff --git a/src/transformers/models/nougat/processing_nougat.py b/src/transformers/models/nougat/processing_nougat.py index cbb43449dbe355..138085e8b73ce9 100644 --- a/src/transformers/models/nougat/processing_nougat.py +++ b/src/transformers/models/nougat/processing_nougat.py @@ -71,8 +71,8 @@ class NougatProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = "AutoTokenizer" + image_processor_class = "NougatImageProcessor" + tokenizer_class = "NougatTokenizerFast" def __init__(self, image_processor, tokenizer): super().__init__(image_processor, tokenizer) @@ -86,6 +86,32 @@ def __call__( videos=None, **kwargs: Unpack[NougatProcessorKwargs], ): + """ + Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and + `kwargs` arguments to NougatTokenizerFast's [`~NougatTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to + NougatImageProcessor's [`~NougatImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring + of the above two methods for more information. + + Args: + text (`str`, `List[str]`, `List[List[str]]`, *optional*): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, + `List[torch.Tensor]`, *optional*): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + - **labels** -- List of label token ids to be fed to a model. Returned when both `text` and `images` are not `None`. + """ if images is None and text is None: raise ValueError("You need to specify either an `images` or `text` input to process.") diff --git a/src/transformers/models/owlv2/processing_owlv2.py b/src/transformers/models/owlv2/processing_owlv2.py index 36af9c29c69da7..ba8be3c115e14a 100644 --- a/src/transformers/models/owlv2/processing_owlv2.py +++ b/src/transformers/models/owlv2/processing_owlv2.py @@ -22,8 +22,9 @@ import numpy as np from ...image_utils import ImageInput +from ...feature_extraction_utils import BatchFeature from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin -from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput +from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import is_flax_available, is_tf_available, is_torch_available @@ -105,7 +106,7 @@ def __call__( should be of shape (C, H, W), where C is a number of channels, H and W are image height and width. Returns: - [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. 
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not @@ -172,12 +173,12 @@ def __call__( else: raise ValueError("Target return tensor type could not be returned") - encoding = BatchEncoding() + encoding = BatchFeature() encoding["input_ids"] = input_ids encoding["attention_mask"] = attention_mask if query_images is not None: - encoding = BatchEncoding() + encoding = BatchFeature() query_pixel_values = self.image_processor(query_images, **output_kwargs["images_kwargs"]).pixel_values encoding["query_pixel_values"] = query_pixel_values @@ -193,7 +194,7 @@ def __call__( elif text is not None or query_images is not None: return encoding else: - return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) + return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_object_detection with OWLViT->OWLv2 def post_process_object_detection(self, *args, **kwargs): diff --git a/src/transformers/models/owlvit/processing_owlvit.py b/src/transformers/models/owlvit/processing_owlvit.py index fe6dcc96907d40..5c676ce8ea0c4f 100644 --- a/src/transformers/models/owlvit/processing_owlvit.py +++ b/src/transformers/models/owlvit/processing_owlvit.py @@ -23,8 +23,9 @@ import numpy as np from ...image_utils import ImageInput +from ...feature_extraction_utils import BatchFeature from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin -from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput +from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import is_flax_available, is_tf_available, is_torch_available @@ -121,7 +122,7 @@ def __call__( should be of shape (C, H, W), where C is a number of channels, H and W are image height and width. Returns: - [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. 
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not `None`). @@ -189,12 +190,12 @@ def __call__( else: raise ValueError("Target return tensor type could not be returned") - encoding = BatchEncoding() + encoding = BatchFeature() encoding["input_ids"] = input_ids encoding["attention_mask"] = attention_mask if query_images is not None: - encoding = BatchEncoding() + encoding = BatchFeature() query_pixel_values = self.image_processor(query_images, **output_kwargs["images_kwargs"]).pixel_values encoding["query_pixel_values"] = query_pixel_values @@ -210,7 +211,7 @@ def __call__( elif text is not None or query_images is not None: return encoding else: - return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) + return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) def post_process(self, *args, **kwargs): """ diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 98d0ad5f5bb70b..f13a45ae66bd9d 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -1013,8 +1013,10 @@ def __call__( ) if len(args) > len(self.optional_call_args): raise ValueError( - f"Expected *at most* {len(self.optional_call_args)} optional positional arguments in processor call but received {len(args)}." - "Passing positional arguments to the processor call is not recommended" + f"Expected *at most* {len(self.optional_call_args)} optional positional arguments in processor call " + f"which will be matched with {' '.join(self.optional_call_args)} in the order they are passed. " + f"However, got {len(args)} positional arguments instead. " + "Please pass all arguments as keyword arguments instead (e.g. `processor(arg_name_1=..., arg_name_2=...)`)." ) return {arg_name: arg_value for arg_value, arg_name in zip(args, self.optional_call_args)} From 3a2f7ef0a56a41910cf4a480a12a83fdf15ee61a Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Mon, 19 Aug 2024 16:00:55 +0800 Subject: [PATCH 06/10] fix docs --- src/transformers/models/clipseg/processing_clipseg.py | 11 ++++++----- src/transformers/models/nougat/processing_nougat.py | 8 ++++---- src/transformers/models/owlv2/processing_owlv2.py | 3 ++- src/transformers/models/owlvit/processing_owlvit.py | 3 ++- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/clipseg/processing_clipseg.py b/src/transformers/models/clipseg/processing_clipseg.py index f99a8231fe343d..3c3995e46ac7e3 100644 --- a/src/transformers/models/clipseg/processing_clipseg.py +++ b/src/transformers/models/clipseg/processing_clipseg.py @@ -20,8 +20,8 @@ import warnings from typing import List, Optional, Union -from ...image_utils import ImageInput from ...feature_extraction_utils import BatchFeature +from ...image_utils import ImageInput from ...processing_utils import ProcessingKwargs, ProcessorMixin from ...tokenization_utils_base import PreTokenizedInput, TextInput @@ -109,11 +109,12 @@ def __call__( Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: - - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None` and `visual_prompt` is `None`.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not - `None`). + `None`) and `visual_prompt` is `None`. - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + - **conditional_pixel_values** -- Conditional pixel values to be fed to a model. Returned when `visual_prompt` is not `None`. """ output_kwargs = self._merge_kwargs( @@ -145,7 +146,7 @@ def __call__( "pixel_values": image_features.pixel_values, "conditional_pixel_values": prompt_features.pixel_values, } - return encoding + return BatchFeature(data=encoding, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) elif text is not None and images is not None: encoding["pixel_values"] = image_features.pixel_values return encoding @@ -155,7 +156,7 @@ def __call__( encoding = { "conditional_pixel_values": prompt_features.pixel_values, } - return encoding + return BatchFeature(data=encoding, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) else: return BatchFeature( data=dict(**image_features), tensor_type=output_kwargs["common_kwargs"].get("return_tensors") diff --git a/src/transformers/models/nougat/processing_nougat.py b/src/transformers/models/nougat/processing_nougat.py index 138085e8b73ce9..21baf3e3cc1a22 100644 --- a/src/transformers/models/nougat/processing_nougat.py +++ b/src/transformers/models/nougat/processing_nougat.py @@ -94,14 +94,14 @@ def __call__( of the above two methods for more information. Args: - text (`str`, `List[str]`, `List[List[str]]`, *optional*): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch tensor. Both channels-first and channels-last formats are supported. + text (`str`, `List[str]`, `List[List[str]]`, *optional*): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: diff --git a/src/transformers/models/owlv2/processing_owlv2.py b/src/transformers/models/owlv2/processing_owlv2.py index ba8be3c115e14a..1844c7237f98cf 100644 --- a/src/transformers/models/owlv2/processing_owlv2.py +++ b/src/transformers/models/owlv2/processing_owlv2.py @@ -21,8 +21,8 @@ import numpy as np -from ...image_utils import ImageInput from ...feature_extraction_utils import BatchFeature +from ...image_utils import ImageInput from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import is_flax_available, is_tf_available, is_torch_available @@ -112,6 +112,7 @@ def __call__( `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not `None`). 
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + - **query_pixel_values** -- Pixel values of the query images to be fed to a model. Returned when `query_images` is not `None`. """ output_kwargs = self._merge_kwargs( Owlv2ProcessorKwargs, diff --git a/src/transformers/models/owlvit/processing_owlvit.py b/src/transformers/models/owlvit/processing_owlvit.py index 5c676ce8ea0c4f..03c530744c7dfe 100644 --- a/src/transformers/models/owlvit/processing_owlvit.py +++ b/src/transformers/models/owlvit/processing_owlvit.py @@ -22,8 +22,8 @@ import numpy as np -from ...image_utils import ImageInput from ...feature_extraction_utils import BatchFeature +from ...image_utils import ImageInput from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import is_flax_available, is_tf_available, is_torch_available @@ -128,6 +128,7 @@ def __call__( `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not `None`). - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + - **query_pixel_values** -- Pixel values of the query images to be fed to a model. Returned when `query_images` is not `None`. """ output_kwargs = self._merge_kwargs( From a280b3ac41f1adcb6bd178875c622f2496d676bb Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Mon, 19 Aug 2024 16:07:26 +0800 Subject: [PATCH 07/10] rm video testing --- tests/test_processing_common.py | 31 ------------------------------- 1 file changed, 31 deletions(-) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index bad0eb8cd6b72b..05cc96d3ce76dc 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -26,7 +26,6 @@ import unittest import numpy as np -from huggingface_hub import hf_hub_download from transformers import CLIPTokenizerFast, ProcessorMixin from transformers.models.auto.processing_auto import processor_class_from_name @@ -92,13 +91,6 @@ def prepare_image_inputs(self): image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] return image_inputs - @require_vision - def prepare_video_inputs(self): - video_file = hf_hub_download( - repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset" - ) - return [np.load(video_file)] - def test_processor_to_json_string(self): processor = self.get_processor() obj = json.loads(processor.to_json_string()) @@ -169,29 +161,6 @@ def test_image_processor_defaults_preserved_by_image_kwargs(self): inputs = processor(text=input_str, images=image_input, return_tensors="pt") self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 234) - @require_torch - @require_vision - def test_video_processor_defaults_preserved_by_kwargs(self): - if "video_processor" not in self.processor_class.attributes: - self.skipTest(f"video_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", size=(234, 234), crop_size=(234, 234)) - video_processor = self.get_component("video_processor", size=(234, 234), crop_size=(234, 234)) - tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") - - processor = self.processor_class( - tokenizer=tokenizer, - image_processor=image_processor, - video_processor=video_processor, - ) - self.skip_processor_without_typed_kwargs(processor) - - input_str = "lower newer" - image_input = 
self.prepare_image_inputs() - video_input = self.prepare_video_inputs() - - inputs = processor(text=input_str, images=image_input, videos=video_input, return_tensors="pt") - self.assertEqual(inputs[self.videos_data_arg_name].shape[-1], 234) - @require_vision @require_torch def test_kwargs_overrides_default_tokenizer_kwargs(self): From ca925cc872a147ca501e15f53d8174e3e57a824d Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Tue, 20 Aug 2024 19:34:07 +0800 Subject: [PATCH 08/10] make processor call implementations simpler too --- .../models/clipseg/processing_clipseg.py | 33 +++++-------------- .../models/nougat/processing_nougat.py | 27 ++++++++------- .../models/owlv2/processing_owlv2.py | 31 +++++++++-------- .../models/owlvit/processing_owlvit.py | 29 ++++++++-------- 4 files changed, 49 insertions(+), 71 deletions(-) diff --git a/src/transformers/models/clipseg/processing_clipseg.py b/src/transformers/models/clipseg/processing_clipseg.py index 3c3995e46ac7e3..55326d6147243a 100644 --- a/src/transformers/models/clipseg/processing_clipseg.py +++ b/src/transformers/models/clipseg/processing_clipseg.py @@ -85,7 +85,7 @@ def __call__( audio=None, videos=None, **kwargs: Unpack[CLIPSegProcessorKwargs], - ): + ) -> BatchFeature: """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode @@ -132,35 +132,18 @@ def __call__( if text is not None and visual_prompt is not None: raise ValueError("You have to specify exactly one type of prompt. Either text or visual prompt.") + data = {} if text is not None: - encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) - + text_features = self.tokenizer(text, **output_kwargs["text_kwargs"]) + data.update(text_features) if visual_prompt is not None: - prompt_features = self.image_processor(visual_prompt, **output_kwargs["images_kwargs"]) - + prompt_image_features = self.image_processor(visual_prompt, **output_kwargs["images_kwargs"]) + data["conditional_pixel_values"] = prompt_image_features.pixel_values if images is not None: image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) + data["pixel_values"] = image_features.pixel_values - if visual_prompt is not None and images is not None: - encoding = { - "pixel_values": image_features.pixel_values, - "conditional_pixel_values": prompt_features.pixel_values, - } - return BatchFeature(data=encoding, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) - elif text is not None and images is not None: - encoding["pixel_values"] = image_features.pixel_values - return encoding - elif text is not None: - return encoding - elif visual_prompt is not None: - encoding = { - "conditional_pixel_values": prompt_features.pixel_values, - } - return BatchFeature(data=encoding, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) - else: - return BatchFeature( - data=dict(**image_features), tensor_type=output_kwargs["common_kwargs"].get("return_tensors") - ) + return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/nougat/processing_nougat.py b/src/transformers/models/nougat/processing_nougat.py index 21baf3e3cc1a22..638659ff49f6af 100644 --- a/src/transformers/models/nougat/processing_nougat.py +++ b/src/transformers/models/nougat/processing_nougat.py @@ -19,6 +19,7 @@ 
import sys from typing import List, Optional, Union +from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin from ...tokenization_utils_base import PreTokenizedInput, TextInput @@ -85,7 +86,7 @@ def __call__( audio=None, videos=None, **kwargs: Unpack[NougatProcessorKwargs], - ): + ) -> BatchFeature: """ Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and `kwargs` arguments to NougatTokenizerFast's [`~NougatTokenizerFast.__call__`] if `text` is not `None` to encode @@ -105,12 +106,12 @@ def __call__( Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: - - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **labels** -- List of token ids to be fed to a model. Returned when both `text` and `images` are not `None`. + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None` and `images` is `None`. - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not `None`). - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. - - **labels** -- List of label token ids to be fed to a model. Returned when both `text` and `images` are not `None`. """ if images is None and text is None: raise ValueError("You need to specify either an `images` or `text` input to process.") @@ -123,18 +124,16 @@ def __call__( # Temporary fix for "paddding_side" in init_kwargs _ = output_kwargs["text_kwargs"].pop("padding_side", None) - if images is not None: - inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) + data = {} if text is not None: - encodings = self.tokenizer(text, **output_kwargs["text_kwargs"]) - - if text is None: - return inputs - elif images is None: - return encodings - else: - inputs["labels"] = encodings["input_ids"] - return inputs + text_features = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) + data.update(text_features) + if images is not None: + image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) + data.update(image_features) + if "input_ids" in data: + data["labels"] = data.pop("input_ids") + return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/owlv2/processing_owlv2.py b/src/transformers/models/owlv2/processing_owlv2.py index 1844c7237f98cf..5189bea0e6b41c 100644 --- a/src/transformers/models/owlv2/processing_owlv2.py +++ b/src/transformers/models/owlv2/processing_owlv2.py @@ -17,6 +17,7 @@ """ import sys +import warnings from typing import List, Optional, Union import numpy as np @@ -83,7 +84,7 @@ def __call__( audio=None, videos=None, **kwargs: Unpack[Owlv2ProcessorKwargs], - ): + ) -> BatchFeature: """ Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode: @@ -114,6 +115,7 @@ def __call__( - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. - **query_pixel_values** -- Pixel values of the query images to be fed to a model. 
Returned when `query_images` is not `None`. """ + output_kwargs = self._merge_kwargs( Owlv2ProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, @@ -128,6 +130,12 @@ def __call__( raise ValueError( "You have to specify at least one text or query image or image. All three cannot be none." ) + if text is not None and query_images is not None: + warnings.warn( + "Query images will override the text prompt. In the future, this will raise an error.", FutureWarning + ) + + data = {} if text is not None: if isinstance(text, str) or (isinstance(text, List) and not isinstance(text[0], List)): @@ -174,28 +182,19 @@ def __call__( else: raise ValueError("Target return tensor type could not be returned") - encoding = BatchFeature() - encoding["input_ids"] = input_ids - encoding["attention_mask"] = attention_mask + data["input_ids"] = input_ids + data["attention_mask"] = attention_mask if query_images is not None: - encoding = BatchFeature() query_pixel_values = self.image_processor(query_images, **output_kwargs["images_kwargs"]).pixel_values - encoding["query_pixel_values"] = query_pixel_values + # Query images always override the text prompt + data = {"query_pixel_values": query_pixel_values} if images is not None: image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) + data["pixel_values"] = image_features.pixel_values - if text is not None and images is not None: - encoding["pixel_values"] = image_features.pixel_values - return encoding - elif query_images is not None and images is not None: - encoding["pixel_values"] = image_features.pixel_values - return encoding - elif text is not None or query_images is not None: - return encoding - else: - return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) + return BatchFeature(data=data, tensor_type=return_tensors) # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.post_process_object_detection with OWLViT->OWLv2 def post_process_object_detection(self, *args, **kwargs): diff --git a/src/transformers/models/owlvit/processing_owlvit.py b/src/transformers/models/owlvit/processing_owlvit.py index 03c530744c7dfe..5860f1043625f6 100644 --- a/src/transformers/models/owlvit/processing_owlvit.py +++ b/src/transformers/models/owlvit/processing_owlvit.py @@ -99,7 +99,7 @@ def __call__( audio=None, videos=None, **kwargs: Unpack[OwlViTProcessorKwargs], - ): + ) -> BatchFeature: """ Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode: @@ -145,6 +145,12 @@ def __call__( raise ValueError( "You have to specify at least one text or query image or image. All three cannot be none." ) + if text is not None and query_images is not None: + warnings.warn( + "Query images will override the text prompt. 
In the future, this will raise an error.", FutureWarning + ) + + data = {} if text is not None: if isinstance(text, str) or (isinstance(text, List) and not isinstance(text[0], List)): @@ -191,28 +197,19 @@ def __call__( else: raise ValueError("Target return tensor type could not be returned") - encoding = BatchFeature() - encoding["input_ids"] = input_ids - encoding["attention_mask"] = attention_mask + data["input_ids"] = input_ids + data["attention_mask"] = attention_mask if query_images is not None: - encoding = BatchFeature() query_pixel_values = self.image_processor(query_images, **output_kwargs["images_kwargs"]).pixel_values - encoding["query_pixel_values"] = query_pixel_values + # Query images always override the text prompt + data = {"query_pixel_values": query_pixel_values} if images is not None: image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) + data["pixel_values"] = image_features.pixel_values - if text is not None and images is not None: - encoding["pixel_values"] = image_features.pixel_values - return encoding - elif query_images is not None and images is not None: - encoding["pixel_values"] = image_features.pixel_values - return encoding - elif text is not None or query_images is not None: - return encoding - else: - return BatchFeature(data=dict(**image_features), tensor_type=return_tensors) + return BatchFeature(data=data, tensor_type=return_tensors) def post_process(self, *args, **kwargs): """ From 71a7ee109345438a27344e4577a38be0bddb9e34 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Wed, 21 Aug 2024 00:47:34 +0800 Subject: [PATCH 09/10] fix test for clipseg and add more tests for owl models --- tests/models/clipseg/test_processor_clipseg.py | 2 +- tests/models/owlv2/test_processor_owlv2.py | 17 +++++++++++++++++ tests/models/owlvit/test_processor_owlvit.py | 15 +++++++++++++++ 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/tests/models/clipseg/test_processor_clipseg.py b/tests/models/clipseg/test_processor_clipseg.py index a535219a23fb0c..7ac937bfa1ff99 100644 --- a/tests/models/clipseg/test_processor_clipseg.py +++ b/tests/models/clipseg/test_processor_clipseg.py @@ -187,7 +187,7 @@ def test_processor_visual_prompt(self): inputs = processor(images=image_input, visual_prompt=visual_prompt_input) - self.assertListEqual(list(inputs.keys()), ["pixel_values", "conditional_pixel_values"]) + self.assertListEqual(list(inputs.keys()), ["conditional_pixel_values", "pixel_values"]) # test if it raises when no input is passed with pytest.raises(ValueError): diff --git a/tests/models/owlv2/test_processor_owlv2.py b/tests/models/owlv2/test_processor_owlv2.py index b8f8b5d26cfe41..eadbb7c074fee9 100644 --- a/tests/models/owlv2/test_processor_owlv2.py +++ b/tests/models/owlv2/test_processor_owlv2.py @@ -1,6 +1,8 @@ import tempfile import unittest +import pytest + from transformers import Owlv2Processor from transformers.testing_utils import require_scipy @@ -16,3 +18,18 @@ def setUp(self): self.tmpdirname = tempfile.mkdtemp() processor = self.processor_class.from_pretrained(self.from_pretrained_id) processor.save_pretrained(self.tmpdirname) + + def test_processor_query_images_positional(self): + processor_components = self.prepare_components() + processor = Owlv2Processor(**processor_components) + + image_input = self.prepare_image_inputs() + query_images = self.prepare_image_inputs() + + inputs = processor(None, image_input, query_images) + + self.assertListEqual(list(inputs.keys()), ["query_pixel_values", "pixel_values"]) + + # 
test if it raises when no input is passed + with pytest.raises(ValueError): + processor() diff --git a/tests/models/owlvit/test_processor_owlvit.py b/tests/models/owlvit/test_processor_owlvit.py index 3fadfac0046002..698882233b875d 100644 --- a/tests/models/owlvit/test_processor_owlvit.py +++ b/tests/models/owlvit/test_processor_owlvit.py @@ -258,3 +258,18 @@ def test_tokenizer_decode(self): decoded_tok = tokenizer.batch_decode(predicted_ids) self.assertListEqual(decoded_tok, decoded_processor) + + def test_processor_query_images_positional(self): + processor_components = self.prepare_components() + processor = OwlViTProcessor(**processor_components) + + image_input = self.prepare_image_inputs() + query_images = self.prepare_image_inputs() + + inputs = processor(None, image_input, query_images) + + self.assertListEqual(list(inputs.keys()), ["query_pixel_values", "pixel_values"]) + + # test if it raises when no input is passed + with pytest.raises(ValueError): + processor() From 274b61573a16be1cfab92d11d38448c29f914da1 Mon Sep 17 00:00:00 2001 From: Franz Louis Cesista Date: Wed, 21 Aug 2024 01:04:46 +0800 Subject: [PATCH 10/10] fix test for clipseg --- tests/models/clipseg/test_processor_clipseg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/clipseg/test_processor_clipseg.py b/tests/models/clipseg/test_processor_clipseg.py index 7ac937bfa1ff99..2359201f7c9793 100644 --- a/tests/models/clipseg/test_processor_clipseg.py +++ b/tests/models/clipseg/test_processor_clipseg.py @@ -204,7 +204,7 @@ def test_processor_visual_prompt_positional(self): inputs = processor(None, image_input, visual_prompt_input) - self.assertListEqual(list(inputs.keys()), ["pixel_values", "conditional_pixel_values"]) + self.assertListEqual(list(inputs.keys()), ["conditional_pixel_values", "pixel_values"]) # test if it raises when no input is passed with pytest.raises(ValueError):
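
For reference outside the patches themselves, a minimal usage sketch of the backward-compatibility path this series converges on (assuming the `google/owlvit-base-patch32` checkpoint and the `optional_call_args = ["query_images"]` declaration shown in the hunks above):

```python
import numpy as np
from PIL import Image
from transformers import OwlViTProcessor

# Assumed checkpoint; any OWL-ViT checkpoint served by OwlViTProcessor should behave the same way.
processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")

image = Image.fromarray(np.zeros((64, 64, 3), dtype=np.uint8))
query_image = Image.fromarray(np.zeros((64, 64, 3), dtype=np.uint8))

# Deprecated path exercised by the new tests: `query_images` passed positionally is matched to
# `optional_call_args[0]` by `prepare_and_validate_optional_call_args`, with a deprecation warning.
inputs = processor(None, [image], [query_image], return_tensors="pt")
print(sorted(inputs.keys()))  # expected: ['pixel_values', 'query_pixel_values']

# Preferred, forward-compatible call: everything passed as keyword arguments.
inputs = processor(images=[image], query_images=[query_image], return_tensors="pt")
```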