From 51c201ac45fa0ebd91791f558987f5d6bc4f0ca0 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 15 Jul 2024 07:19:58 +0000 Subject: [PATCH 01/21] fix typo --- docs/source/en/model_doc/grounding-dino.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/en/model_doc/grounding-dino.md b/docs/source/en/model_doc/grounding-dino.md index d258f492abf8b5..609572a2393f01 100644 --- a/docs/source/en/model_doc/grounding-dino.md +++ b/docs/source/en/model_doc/grounding-dino.md @@ -45,19 +45,19 @@ import requests import torch from PIL import Image -from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection, +from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection model_id = "IDEA-Research/grounding-dino-tiny" processor = AutoProcessor.from_pretrained(model_id) -model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device) +model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id) image_url = "http://images.cocodataset.org/val2017/000000039769.jpg" image = Image.open(requests.get(image_url, stream=True).raw) # Check for cats and remote controls text = "a cat. a remote control." -inputs = processor(images=image, text=text, return_tensors="pt").to(device) +inputs = processor(images=image, text=text, return_tensors="pt") with torch.no_grad(): outputs = model(**inputs) From 9dd38e20bcebfa3f6cdf812f79f4a50fe0104363 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 15 Jul 2024 07:20:07 +0000 Subject: [PATCH 02/21] uniform kwargs --- .../processing_grounding_dino.py | 74 +++++++++++-------- 1 file changed, 44 insertions(+), 30 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 44b99811d931ce..982b82b37ffe5f 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -16,12 +16,20 @@ Processor class for Grounding DINO. 
""" +import sys from typing import List, Optional, Tuple, Union from ...image_processing_utils import BatchFeature from ...image_transforms import center_to_corners_format from ...image_utils import ImageInput -from ...processing_utils import ProcessorMixin +from ...processing_utils import ProcessingKwargs, ProcessorMixin + + +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy from ...utils import TensorType, is_torch_available @@ -56,6 +64,26 @@ def get_phrases_from_posmap(posmaps, input_ids): return token_ids +class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "add_special_tokens": True, + "padding": False, + "truncation": None, + "max_length": None, + "stride": 0, + "pad_to_multiple_of": None, + "return_attention_mask": None, + "return_overflowing_tokens": False, + "return_special_tokens_mask": False, + "return_offsets_mapping": False, + "return_token_type_ids": True, + "return_length": False, + "verbose": True, + } + } + + class GroundingDinoProcessor(ProcessorMixin): r""" Constructs a Grounding DINO processor which wraps a Deformable DETR image processor and a BERT tokenizer into a @@ -83,21 +111,8 @@ def __call__( self, images: ImageInput = None, text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, - add_special_tokens: bool = True, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = None, - max_length: Optional[int] = None, - stride: int = 0, - pad_to_multiple_of: Optional[int] = None, - return_attention_mask: Optional[bool] = None, - return_overflowing_tokens: bool = False, - return_special_tokens_mask: bool = False, - return_offsets_mapping: bool = False, - return_token_type_ids: bool = True, - return_length: bool = False, - verbose: bool = True, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs, + **kwargs: Unpack[GroundingDinoProcessorKwargs], ) -> BatchEncoding: """ This method uses [`GroundingDinoImageProcessor.__call__`] method to prepare image(s) for the model, and @@ -108,30 +123,29 @@ def __call__( if images is None and text is None: raise ValueError("You have to specify either images or text.") + output_kwargs = self._merge_kwargs( + GroundingDinoProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + + # BC for explicit return_tensors + if "return_tensors" in output_kwargs["common_kwargs"]: + return_tensors = output_kwargs["common_kwargs"].pop("return_tensors", None) + # Get only text if images is not None: - encoding_image_processor = self.image_processor(images, return_tensors=return_tensors) + encoding_image_processor = self.image_processor( + images, return_tensors=return_tensors, **output_kwargs["images_kwargs"] + ) else: encoding_image_processor = BatchFeature() if text is not None: text_encoding = self.tokenizer( text=text, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - stride=stride, - pad_to_multiple_of=pad_to_multiple_of, - return_attention_mask=return_attention_mask, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_offsets_mapping=return_offsets_mapping, - return_token_type_ids=return_token_type_ids, - return_length=return_length, - verbose=verbose, 
return_tensors=return_tensors, - **kwargs, + **output_kwargs["text_kwargs"], ) else: text_encoding = BatchEncoding() From ebc386282dccc725a1a7be9ae1edab25eda1a977 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 15 Jul 2024 07:24:45 +0000 Subject: [PATCH 03/21] make style --- .../models/grounding_dino/processing_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 982b82b37ffe5f..1020820efb8050 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -30,7 +30,7 @@ else: from typing_extensions import Unpack -from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput from ...utils import TensorType, is_torch_available From c6dc44528b39f8f58f072d2db912844ec501b9a3 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 15 Jul 2024 13:05:22 +0000 Subject: [PATCH 04/21] add comments --- .../processing_grounding_dino.py | 17 +++++----- src/transformers/processing_utils.py | 28 ++++++++++++++++ .../test_processor_grounding_dino.py | 33 ++++++++++++++++++- 3 files changed, 69 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 1020820efb8050..fe80864f8e0d09 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -17,7 +17,7 @@ """ import sys -from typing import List, Optional, Tuple, Union +from typing import List, Tuple, Union from ...image_processing_utils import BatchFeature from ...image_transforms import center_to_corners_format @@ -69,18 +69,18 @@ class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): "text_kwargs": { "add_special_tokens": True, "padding": False, - "truncation": None, - "max_length": None, "stride": 0, - "pad_to_multiple_of": None, - "return_attention_mask": None, "return_overflowing_tokens": False, "return_special_tokens_mask": False, "return_offsets_mapping": False, - "return_token_type_ids": True, + "return_token_type_ids": False, "return_length": False, "verbose": True, - } + }, + "images_kwargs": { + "do_convert_annotations": True, + "do_resize": True, + }, } @@ -111,7 +111,8 @@ def __call__( self, images: ImageInput = None, text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, - return_tensors: Optional[Union[str, TensorType]] = None, + audio=None, + videos=None, **kwargs: Unpack[GroundingDinoProcessorKwargs], ) -> BatchEncoding: """ diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 7062a7699a79f7..2e724eb2264a0a 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -20,6 +20,7 @@ import inspect import json import os +import pathlib import warnings from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, TypedDict, Union @@ -40,6 +41,7 @@ ) from .utils import ( PROCESSOR_NAME, + ExplicitEnum, PushToHubMixin, TensorType, add_model_info_to_auto_map, @@ -56,6 +58,14 @@ logger = logging.get_logger(__name__) +AnnotationType = Dict[str, Union[int, str, List[Dict]]] + + 
+class AnnotationFormat(ExplicitEnum): + COCO_DETECTION = "coco_detection" + COCO_PANOPTIC = "coco_panoptic" + + # Dynamically import the Transformers module to grab the attribute classes of the processor form their names. transformers_module = direct_transformers_import(Path(__file__).parent) @@ -128,6 +138,12 @@ class ImagesKwargs(TypedDict, total=False): class methods and docstrings. Attributes: + annotations (`AnnotationType` or `List[AnnotationType]`, *optional*): + List of annotations associated with the image or batch of images. + return_segmentation_masks (`bool`, *optional*): + Whether to return segmentation masks. + masks_path (`str` or `pathlib.Path`, *optional*): + Path to the directory containing the segmentation masks. do_resize (`bool`, *optional*): Whether to resize the image. size (`Dict[str, int]`, *optional*): @@ -144,6 +160,8 @@ class methods and docstrings. Scale factor to use if rescaling the image. do_normalize (`bool`, *optional*): Whether to normalize the image. + do_convert_annotations (`bool`, *optional*): + Whether to convert the annotations to the format expected by the model. image_mean (`float` or `List[float]`, *optional*): Mean to use if normalizing the image. image_std (`float` or `List[float]`, *optional*): @@ -152,12 +170,19 @@ class methods and docstrings. Whether to pad the image to the `(max_height, max_width)` of the images in the batch. do_center_crop (`bool`, *optional*): Whether to center crop the image. + format (`str` or `AnnotationFormat`, *optional*): + Format of the annotations. data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the output image. input_data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the input image. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. """ + annotations: Optional[Union[AnnotationType, List[AnnotationType]]] + return_segmentation_masks: Optional[bool] + masks_path: Optional[Union[str, pathlib.Path]] do_resize: Optional[bool] size: Optional[Dict[str, int]] size_divisor: Optional[int] @@ -166,12 +191,15 @@ class methods and docstrings. 
do_rescale: Optional[bool] rescale_factor: Optional[float] do_normalize: Optional[bool] + do_convert_annotations: Optional[bool] image_mean: Optional[Union[float, List[float]]] image_std: Optional[Union[float, List[float]]] do_pad: Optional[bool] do_center_crop: Optional[bool] + format: Optional[Union[str, AnnotationFormat]] data_format: Optional[ChannelDimension] input_data_format: Optional[Union[str, ChannelDimension]] + pad_size: Optional[Dict[str, int]] class VideosKwargs(TypedDict, total=False): diff --git a/tests/models/grounding_dino/test_processor_grounding_dino.py b/tests/models/grounding_dino/test_processor_grounding_dino.py index a788d09ca7eed1..b7a259f0c31526 100644 --- a/tests/models/grounding_dino/test_processor_grounding_dino.py +++ b/tests/models/grounding_dino/test_processor_grounding_dino.py @@ -26,6 +26,8 @@ from transformers.testing_utils import require_torch, require_vision from transformers.utils import IMAGE_PROCESSOR_NAME, is_torch_available, is_vision_available +from ...test_processing_common import ProcessorTesterMixin + if is_torch_available(): import torch @@ -40,7 +42,9 @@ @require_torch @require_vision -class GroundingDinoProcessorTest(unittest.TestCase): +class GroundingDinoProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = GroundingDinoProcessor + def setUp(self): self.tmpdirname = tempfile.mkdtemp() @@ -251,3 +255,30 @@ def test_model_input_names(self): inputs = processor(text=input_str, images=image_input) self.assertListEqual(list(inputs.keys()), processor.model_input_names) + + @require_torch + @require_vision + def test_unstructured_kwargs_batched(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = ["lower newer", "upper older longer string"] + image_input = self.prepare_image_inputs() * 2 + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + crop_size={"height": 214, "width": 214}, + size={"height": 214, "width": 214}, + padding="longest", + max_length=76, + ) + self.assertEqual(inputs["pixel_values"].shape[2], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 11) From 16ddefd33d42be09015f6e06916d1f170ed050ac Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Tue, 16 Jul 2024 01:00:01 +0000 Subject: [PATCH 05/21] remove return_tensors --- .../models/grounding_dino/processing_grounding_dino.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index fe80864f8e0d09..4aa081fd70f224 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -122,7 +122,7 @@ def __call__( Please refer to the docstring of the above two methods for more information. 
""" if images is None and text is None: - raise ValueError("You have to specify either images or text.") + raise ValueError("You must specify either text or images.") output_kwargs = self._merge_kwargs( GroundingDinoProcessorKwargs, @@ -130,14 +130,10 @@ def __call__( **kwargs, ) - # BC for explicit return_tensors - if "return_tensors" in output_kwargs["common_kwargs"]: - return_tensors = output_kwargs["common_kwargs"].pop("return_tensors", None) - # Get only text if images is not None: encoding_image_processor = self.image_processor( - images, return_tensors=return_tensors, **output_kwargs["images_kwargs"] + images, **output_kwargs["common_kwargs"], **output_kwargs["images_kwargs"] ) else: encoding_image_processor = BatchFeature() @@ -145,7 +141,7 @@ def __call__( if text is not None: text_encoding = self.tokenizer( text=text, - return_tensors=return_tensors, + **output_kwargs["common_kwargs"], **output_kwargs["text_kwargs"], ) else: From 1f9a0eeb22e07fd6f0a8a904146e7186e4938c95 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Tue, 23 Jul 2024 01:39:42 +0000 Subject: [PATCH 06/21] remove common_kwargs from processor since it propagates --- .../models/grounding_dino/processing_grounding_dino.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 4aa081fd70f224..c005f2d031087c 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -133,7 +133,7 @@ def __call__( # Get only text if images is not None: encoding_image_processor = self.image_processor( - images, **output_kwargs["common_kwargs"], **output_kwargs["images_kwargs"] + images, **output_kwargs["images_kwargs"] ) else: encoding_image_processor = BatchFeature() @@ -141,7 +141,6 @@ def __call__( if text is not None: text_encoding = self.tokenizer( text=text, - **output_kwargs["common_kwargs"], **output_kwargs["text_kwargs"], ) else: From 0696dcf60720e8aa9d94c519c3f3da14d5820bb2 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Tue, 23 Jul 2024 01:51:09 +0000 Subject: [PATCH 07/21] make style --- .../models/grounding_dino/processing_grounding_dino.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index c005f2d031087c..9eccc7320ccc6b 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -132,9 +132,7 @@ def __call__( # Get only text if images is not None: - encoding_image_processor = self.image_processor( - images, **output_kwargs["images_kwargs"] - ) + encoding_image_processor = self.image_processor(images, **output_kwargs["images_kwargs"]) else: encoding_image_processor = BatchFeature() From 850b9d5f6ea7224d6afd17311a20880d86f144d9 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Tue, 23 Jul 2024 02:06:37 +0000 Subject: [PATCH 08/21] return_token_type_ids to True --- .../models/grounding_dino/processing_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 9eccc7320ccc6b..74cda0c5953915 100644 --- 
a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -73,7 +73,7 @@ class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): "return_overflowing_tokens": False, "return_special_tokens_mask": False, "return_offsets_mapping": False, - "return_token_type_ids": False, + "return_token_type_ids": True, "return_length": False, "verbose": True, }, From c96c02b6ecdb0047c1ec19772244042ea3086831 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 24 Jul 2024 00:46:39 +0000 Subject: [PATCH 09/21] revert the default imagekwargs since does not accept any value in the image processro --- .../models/grounding_dino/processing_grounding_dino.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 74cda0c5953915..9928eacbeb5ce0 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -78,8 +78,6 @@ class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): "verbose": True, }, "images_kwargs": { - "do_convert_annotations": True, - "do_resize": True, }, } From 8cff6b609438691cb4cbc1195bd6f058d8794e4b Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 24 Jul 2024 01:08:01 +0000 Subject: [PATCH 10/21] revert processing_utils.py --- src/transformers/processing_utils.py | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 2e724eb2264a0a..7062a7699a79f7 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -20,7 +20,6 @@ import inspect import json import os -import pathlib import warnings from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, TypedDict, Union @@ -41,7 +40,6 @@ ) from .utils import ( PROCESSOR_NAME, - ExplicitEnum, PushToHubMixin, TensorType, add_model_info_to_auto_map, @@ -58,14 +56,6 @@ logger = logging.get_logger(__name__) -AnnotationType = Dict[str, Union[int, str, List[Dict]]] - - -class AnnotationFormat(ExplicitEnum): - COCO_DETECTION = "coco_detection" - COCO_PANOPTIC = "coco_panoptic" - - # Dynamically import the Transformers module to grab the attribute classes of the processor form their names. transformers_module = direct_transformers_import(Path(__file__).parent) @@ -138,12 +128,6 @@ class ImagesKwargs(TypedDict, total=False): class methods and docstrings. Attributes: - annotations (`AnnotationType` or `List[AnnotationType]`, *optional*): - List of annotations associated with the image or batch of images. - return_segmentation_masks (`bool`, *optional*): - Whether to return segmentation masks. - masks_path (`str` or `pathlib.Path`, *optional*): - Path to the directory containing the segmentation masks. do_resize (`bool`, *optional*): Whether to resize the image. size (`Dict[str, int]`, *optional*): @@ -160,8 +144,6 @@ class methods and docstrings. Scale factor to use if rescaling the image. do_normalize (`bool`, *optional*): Whether to normalize the image. - do_convert_annotations (`bool`, *optional*): - Whether to convert the annotations to the format expected by the model. image_mean (`float` or `List[float]`, *optional*): Mean to use if normalizing the image. image_std (`float` or `List[float]`, *optional*): @@ -170,19 +152,12 @@ class methods and docstrings. 
Whether to pad the image to the `(max_height, max_width)` of the images in the batch. do_center_crop (`bool`, *optional*): Whether to center crop the image. - format (`str` or `AnnotationFormat`, *optional*): - Format of the annotations. data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the output image. input_data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the input image. - pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. """ - annotations: Optional[Union[AnnotationType, List[AnnotationType]]] - return_segmentation_masks: Optional[bool] - masks_path: Optional[Union[str, pathlib.Path]] do_resize: Optional[bool] size: Optional[Dict[str, int]] size_divisor: Optional[int] @@ -191,15 +166,12 @@ class methods and docstrings. do_rescale: Optional[bool] rescale_factor: Optional[float] do_normalize: Optional[bool] - do_convert_annotations: Optional[bool] image_mean: Optional[Union[float, List[float]]] image_std: Optional[Union[float, List[float]]] do_pad: Optional[bool] do_center_crop: Optional[bool] - format: Optional[Union[str, AnnotationFormat]] data_format: Optional[ChannelDimension] input_data_format: Optional[Union[str, ChannelDimension]] - pad_size: Optional[Dict[str, int]] class VideosKwargs(TypedDict, total=False): From bb1f18bb3bb9230dd3acfa5349328daf749c5fbe Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 24 Jul 2024 01:25:14 +0000 Subject: [PATCH 11/21] make style --- .../models/grounding_dino/processing_grounding_dino.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 9928eacbeb5ce0..a09d21502b3c07 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -77,8 +77,7 @@ class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): "return_length": False, "verbose": True, }, - "images_kwargs": { - }, + "images_kwargs": {}, } From a476c6ee88318ce40d73ea31e2dc2d4faa8ae410 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 24 Jul 2024 01:47:47 +0000 Subject: [PATCH 12/21] add molbap's commit --- .../processing_grounding_dino.py | 23 +- src/transformers/processing_utils.py | 71 +++---- tests/test_processing_common.py | 200 ++++++++++++++++-- 3 files changed, 225 insertions(+), 69 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index a09d21502b3c07..167b5598bfa7b9 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -16,13 +16,14 @@ Processor class for Grounding DINO. 
""" +import pathlib import sys -from typing import List, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union from ...image_processing_utils import BatchFeature from ...image_transforms import center_to_corners_format from ...image_utils import ImageInput -from ...processing_utils import ProcessingKwargs, ProcessorMixin +from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin if sys.version_info >= (3, 11): @@ -31,12 +32,19 @@ from typing_extensions import Unpack from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput -from ...utils import TensorType, is_torch_available +from ...utils import ExplicitEnum, TensorType, is_torch_available if is_torch_available(): import torch +AnnotationType = Dict[str, Union[int, str, List[Dict]]] + + +class AnnotationFormat(ExplicitEnum): + COCO_DETECTION = "coco_detection" + COCO_PANOPTIC = "coco_panoptic" + def get_phrases_from_posmap(posmaps, input_ids): """Get token ids of phrases from posmaps and input_ids. @@ -64,7 +72,16 @@ def get_phrases_from_posmap(posmaps, input_ids): return token_ids +class GroundingDinoImagesKwargs(ImagesKwargs, total=False): + annotations: Optional[Union[AnnotationType, List[AnnotationType]]] + return_segmentation_masks: Optional[bool] + masks_path: Optional[Union[str, pathlib.Path]] + do_convert_annotations: Optional[bool] + format: Optional[Union[str, AnnotationFormat]] + + class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: GroundingDinoImagesKwargs _defaults = { "text_kwargs": { "add_special_tokens": True, diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 7062a7699a79f7..d9f1e6f5efabde 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -150,6 +150,8 @@ class methods and docstrings. Standard deviation to use if normalizing the image. do_pad (`bool`, *optional*): Whether to pad the image to the `(max_height, max_width)` of the images in the batch. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. do_center_crop (`bool`, *optional*): Whether to center crop the image. data_format (`ChannelDimension` or `str`, *optional*): @@ -169,6 +171,7 @@ class methods and docstrings. image_mean: Optional[Union[float, List[float]]] image_std: Optional[Union[float, List[float]]] do_pad: Optional[bool] + pad_size: Optional[Dict[str, int]] do_center_crop: Optional[bool] data_format: Optional[ChannelDimension] input_data_format: Optional[Union[str, ChannelDimension]] @@ -320,7 +323,6 @@ class ProcessorMixin(PushToHubMixin): feature_extractor_class = None tokenizer_class = None _auto_class = None - valid_kwargs: List[str] = [] # args have to match the attributes class attribute def __init__(self, *args, **kwargs): @@ -649,15 +651,14 @@ def from_args_and_dict(cls, args, processor_dict: Dict[str, Any], **kwargs): processor_dict = processor_dict.copy() return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) - # We have to pop up some unused (but specific) kwargs and then validate that it doesn't contain unused kwargs - # If we don't pop, some specific kwargs will raise a warning + # Unlike image processors or feature extractors whose `__init__` accept `kwargs`, processor don't have `kwargs`. + # We have to pop up some unused (but specific) arguments to make it work. 
if "processor_class" in processor_dict: del processor_dict["processor_class"] if "auto_map" in processor_dict: del processor_dict["auto_map"] - unused_kwargs = cls.validate_init_kwargs(processor_config=processor_dict, valid_kwargs=cls.valid_kwargs) processor = cls(*args, **processor_dict) # Update processor with kwargs if needed @@ -665,7 +666,6 @@ def from_args_and_dict(cls, args, processor_dict: Dict[str, Any], **kwargs): if hasattr(processor, key): setattr(processor, key, kwargs.pop(key)) - kwargs.update(unused_kwargs) logger.info(f"Processor {processor}") if return_unused_kwargs: return processor, kwargs @@ -743,38 +743,34 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg if modality_key in tokenizer_init_kwargs: default_kwargs[modality][modality_key] = tokenizer_init_kwargs[modality_key] # now defaults kwargs are updated with the tokenizers defaults. - # pass defaults to output dictionary output_kwargs.update(default_kwargs) + # gather common kwargs and remove them from individual kwargs if present + common_kwargs = { + key: value + for key, value in kwargs.items() + if key not in ModelProcessorKwargs.__annotations__["text_kwargs"].__annotations__ + and key not in ModelProcessorKwargs.__annotations__["images_kwargs"].__annotations__ + and key not in ModelProcessorKwargs.__annotations__["audio_kwargs"].__annotations__ + and key not in ModelProcessorKwargs.__annotations__["videos_kwargs"].__annotations__ + } + + # ensure common kwargs are propagated to all relevant modalities + for key, value in common_kwargs.items(): + for modality in output_kwargs: + if modality != "common_kwargs": + output_kwargs[modality][key] = value + + # remove common kwargs from the kwargs to process the rest + kwargs = {k: v for k, v in kwargs.items() if k not in common_kwargs} + # update modality kwargs with passed kwargs - non_modality_kwargs = set(kwargs) - set(output_kwargs) for modality in output_kwargs: for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__.keys(): - # check if we received a structured kwarg dict or not to handle it correctly - if modality in kwargs: - kwarg_value = kwargs[modality].pop(modality_key, "__empty__") - # check if this key was passed as a flat kwarg. - if kwarg_value != "__empty__" and modality_key in non_modality_kwargs: - raise ValueError( - f"Keyword argument {modality_key} was passed two times: in a dictionary for {modality} and as a **kwarg." 
- ) + if modality in kwargs and modality_key in kwargs[modality]: + output_kwargs[modality][modality_key] = kwargs[modality][modality_key] elif modality_key in kwargs: - kwarg_value = kwargs.pop(modality_key, "__empty__") - else: - kwarg_value = "__empty__" - if kwarg_value != "__empty__": - output_kwargs[modality][modality_key] = kwarg_value - # if something remains in kwargs, it belongs to common after flattening - if set(kwargs) & set(default_kwargs): - # here kwargs is dictionary-based since it shares keys with default set - [output_kwargs["common_kwargs"].update(subdict) for _, subdict in kwargs.items()] - else: - # here it's a flat dict - output_kwargs["common_kwargs"].update(kwargs) - - # all modality-specific kwargs are updated with common kwargs - for modality in output_kwargs: - output_kwargs[modality].update(output_kwargs["common_kwargs"]) + output_kwargs[modality][modality_key] = kwargs[modality_key] return output_kwargs @classmethod @@ -890,19 +886,6 @@ def model_input_names(self): first_attribute = getattr(self, self.attributes[0]) return getattr(first_attribute, "model_input_names", None) - @staticmethod - def validate_init_kwargs(processor_config, valid_kwargs): - kwargs_from_config = processor_config.keys() - unused_kwargs = {} - unused_keys = set(kwargs_from_config) - set(valid_kwargs) - if unused_keys: - unused_key_str = ", ".join(unused_keys) - logger.warning( - f"Some kwargs in processor config are unused and will not have any effect: {unused_key_str}. " - ) - unused_kwargs = {k: processor_config[k] for k in unused_keys} - return unused_kwargs - def apply_chat_template( self, conversation: Union[List[Dict[str, str]]], diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index 074aa2f1d62545..e6128cde9bb503 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -16,6 +16,7 @@ import inspect import json +import random import tempfile @@ -38,15 +39,31 @@ from transformers.utils import is_vision_available +global_rng = random.Random() + if is_vision_available(): from PIL import Image from transformers import CLIPImageProcessor +# Copied from tests.models.whisper.test_feature_extraction_whisper.floats_list +def floats_list(shape, scale=1.0, rng=None, name=None): + """Creates a random float32 tensor""" + if rng is None: + rng = global_rng + + values = [] + for batch_idx in range(shape[0]): + values.append([]) + for _ in range(shape[1]): + values[-1].append(rng.random() * scale) + + return values + + @require_torch @require_vision -@require_torch class ProcessorTesterMixin: processor_class = None @@ -60,7 +77,10 @@ def get_component(self, attribute, **kwargs): component_class_name = component_class_name[0] component_class = processor_class_from_name(component_class_name) - component = component_class.from_pretrained(self.tmpdirname, **kwargs) # noqa + if hasattr(self, "tmpdirname"): + component = component_class.from_pretrained(self.tmpdirname, **kwargs) # noqa + elif hasattr(self, "model_id"): + component = component_class.from_pretrained(self.model_id, **kwargs) # noqa return component @@ -126,13 +146,13 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer", max_length=117) - + tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") + if not 
tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input, return_tensors="pt") self.assertEqual(len(inputs["input_ids"][0]), 117) @@ -141,15 +161,15 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): def test_image_processor_defaults_preserved_by_image_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", crop_size=(234, 234)) + image_processor = self.get_component("image_processor", crop_size=(234, 234), size=(234, 234)) tokenizer = self.get_component("tokenizer", max_length=117) - + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input) self.assertEqual(len(inputs["pixel_values"][0][0]), 234) @@ -160,13 +180,15 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer", max_length=117) - + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=112) + inputs = processor( + text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length" + ) self.assertEqual(len(inputs["input_ids"][0]), 112) @require_torch @@ -174,16 +196,17 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): def test_kwargs_overrides_default_image_processor_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", crop_size=(234, 234)) + image_processor = self.get_component("image_processor", crop_size=(234, 234), size=(234, 234)) tokenizer = self.get_component("tokenizer", max_length=117) - + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input, crop_size=[224, 224]) + inputs = processor(text=input_str, images=image_input, crop_size=[224, 224], size=[224, 224]) self.assertEqual(len(inputs["pixel_values"][0][0]), 224) @require_torch @@ -193,7 +216,8 @@ def test_unstructured_kwargs(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") - + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" processor = self.processor_class(tokenizer=tokenizer, 
image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) @@ -204,6 +228,7 @@ def test_unstructured_kwargs(self): images=image_input, return_tensors="pt", crop_size={"height": 214, "width": 214}, + size={"height": 214, "width": 214}, padding="max_length", max_length=76, ) @@ -218,7 +243,8 @@ def test_unstructured_kwargs_batched(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") - + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) @@ -229,10 +255,10 @@ def test_unstructured_kwargs_batched(self): images=image_input, return_tensors="pt", crop_size={"height": 214, "width": 214}, + size={"height": 214, "width": 214}, padding="longest", max_length=76, ) - self.assertEqual(inputs["pixel_values"].shape[2], 214) self.assertEqual(len(inputs["input_ids"][0]), 6) @@ -244,7 +270,8 @@ def test_doubly_passed_kwargs(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") - + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) @@ -265,7 +292,8 @@ def test_structured_kwargs_nested(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") - + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) @@ -275,7 +303,7 @@ def test_structured_kwargs_nested(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"crop_size": {"height": 214, "width": 214}}, + "images_kwargs": {"crop_size": {"height": 214, "width": 214}, "size": {"height": 214, "width": 214}}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } @@ -294,7 +322,8 @@ def test_structured_kwargs_nested_from_dict(self): image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") - + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" @@ -303,7 +332,7 @@ def test_structured_kwargs_nested_from_dict(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"crop_size": {"height": 214, "width": 214}}, + "images_kwargs": {"crop_size": {"height": 214, "width": 214}, "size": {"height": 214, "width": 214}}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } @@ -312,6 +341,133 @@ def test_structured_kwargs_nested_from_dict(self): self.assertEqual(len(inputs["input_ids"][0]), 76) + # text + audio kwargs testing + @require_torch + def test_tokenizer_defaults_preserved_by_kwargs_audio(self): + if "feature_extractor" not in self.processor_class.attributes: + self.skipTest(f"feature_extractor attribute not present in 
{self.processor_class}") + feature_extractor = self.get_component("feature_extractor") + if hasattr(self, "get_tokenizer"): + tokenizer = self.get_tokenizer(max_length=117, padding="max_length") + elif hasattr(self, "get_component"): + tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + raw_speech = floats_list((3, 1000)) + inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt") + if "input_ids" in inputs: + self.assertEqual(len(inputs["input_ids"][0]), 117) + elif "labels" in inputs: + self.assertEqual(len(inputs["labels"][0]), 117) + + @require_torch + def test_kwargs_overrides_default_tokenizer_kwargs_audio(self): + if "feature_extractor" not in self.processor_class.attributes: + self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") + feature_extractor = self.get_component("feature_extractor") + if hasattr(self, "get_tokenizer"): + tokenizer = self.get_tokenizer(max_length=117) + elif hasattr(self, "get_component"): + tokenizer = self.get_component("tokenizer", max_length=117) + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + raw_speech = floats_list((3, 1000)) + inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt", max_length=112, padding="max_length") + if "input_ids" in inputs: + self.assertEqual(len(inputs["input_ids"][0]), 112) + elif "labels" in inputs: + self.assertEqual(len(inputs["labels"][0]), 112) + + @require_torch + def test_unstructured_kwargs_audio(self): + if "feature_extractor" not in self.processor_class.attributes: + self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") + feature_extractor = self.get_component("feature_extractor") + if hasattr(self, "get_tokenizer"): + tokenizer = self.get_tokenizer(max_length=117) + elif hasattr(self, "get_component"): + tokenizer = self.get_component("tokenizer", max_length=117) + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + raw_speech = floats_list((3, 1000)) + inputs = processor( + text=input_str, + audio=raw_speech, + return_tensors="pt", + padding="max_length", + max_length=76, + ) + + if "input_ids" in inputs: + self.assertEqual(len(inputs["input_ids"][0]), 76) + elif "labels" in inputs: + self.assertEqual(len(inputs["labels"][0]), 76) + + @require_torch + def test_doubly_passed_kwargs_audio(self): + if "feature_extractor" not in self.processor_class.attributes: + self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") + feature_extractor = self.get_component("feature_extractor") + if hasattr(self, "get_tokenizer"): + tokenizer = self.get_tokenizer() + elif hasattr(self, "get_component"): + tokenizer = self.get_component("tokenizer") + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) + self.skip_processor_without_typed_kwargs(processor) + + 
input_str = ["lower newer"] + raw_speech = floats_list((3, 1000)) + with self.assertRaises(ValueError): + _ = processor( + text=input_str, + audio=raw_speech, + audio_kwargs={"padding": "max_length"}, + padding="max_length", + ) + + @require_torch + @require_vision + def test_structured_kwargs_audio_nested(self): + if "feature_extractor" not in self.processor_class.attributes: + self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") + feature_extractor = self.get_component("feature_extractor") + if hasattr(self, "get_tokenizer"): + tokenizer = self.get_tokenizer() + elif hasattr(self, "get_component"): + tokenizer = self.get_component("tokenizer") + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = ["lower newer"] + raw_speech = floats_list((3, 1000)) + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "audio_kwargs": {"padding": "max_length", "max_length": 66}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, audio=raw_speech, **all_kwargs) + if "input_ids" in inputs: + self.assertEqual(len(inputs["input_ids"][0]), 76) + elif "labels" in inputs: + self.assertEqual(len(inputs["labels"][0]), 76) + class MyProcessor(ProcessorMixin): attributes = ["image_processor", "tokenizer"] From 81045219a22ed34a73cac3fe924908e57a9ebe53 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 24 Jul 2024 02:40:01 +0000 Subject: [PATCH 13/21] fix typo --- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index dcdccc50cc116d..c33718bde54410 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1580,7 +1580,7 @@ def _set_gradient_checkpointing(self, module, value=False): Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it. - Indices can be obtained using [`AutoTokenizer`]. See [`GroundingDinoTokenizer.__call__`] for details. + Indices can be obtained using [`AutoTokenizer`]. See [`BertTokenizer.__call__`] for details. token_type_ids (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, From 5d6a088566b743d64defbd6bc55cdf00cbbe985e Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 24 Jul 2024 12:28:55 +0000 Subject: [PATCH 14/21] fix common processor --- src/transformers/processing_utils.py | 35 +++++++++++-------- .../test_processor_grounding_dino.py | 10 +++++- tests/test_processing_common.py | 14 ++++---- 3 files changed, 36 insertions(+), 23 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 8e0ab968d8589a..83ad01714dbceb 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -736,12 +736,12 @@ def _merge_kwargs( The order of operations is as follows: 1) kwargs passed as before have highest priority to preserve BC. 
```python - high_priority_kwargs = {"crop_size" = (224, 224), "padding" = "max_length"} + high_priority_kwargs = {"crop_size" = {"height": 222, "width": 222}, "padding" = "max_length"} processor(..., **high_priority_kwargs) ``` 2) kwargs passed as modality-specific kwargs have second priority. This is the recommended API. ```python - processor(..., text_kwargs={"padding": "max_length"}, images_kwargs={"crop_size": (224, 224)}}) + processor(..., text_kwargs={"padding": "max_length"}, images_kwargs={"crop_size": {"height": 222, "width": 222}}}) ``` 3) kwargs passed during instantiation of a modality processor have fourth priority. ```python @@ -799,14 +799,20 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg output_kwargs.update(default_kwargs) # gather common kwargs and remove them from individual kwargs if present - common_kwargs = { - key: value - for key, value in kwargs.items() - if key not in ModelProcessorKwargs.__annotations__["text_kwargs"].__annotations__ - and key not in ModelProcessorKwargs.__annotations__["images_kwargs"].__annotations__ - and key not in ModelProcessorKwargs.__annotations__["audio_kwargs"].__annotations__ - and key not in ModelProcessorKwargs.__annotations__["videos_kwargs"].__annotations__ - } + common_kwargs = {} + for key, value in kwargs.items(): + if key == "common_kwargs": + for common_key, common_value in value.items(): + common_kwargs[common_key] = common_value + elif key in ["text_kwargs", "images_kwargs", "audio_kwargs", "videos_kwargs"]: + pass + elif ( + key not in ModelProcessorKwargs.__annotations__["text_kwargs"].__annotations__ + and key not in ModelProcessorKwargs.__annotations__["images_kwargs"].__annotations__ + and key not in ModelProcessorKwargs.__annotations__["audio_kwargs"].__annotations__ + and key not in ModelProcessorKwargs.__annotations__["videos_kwargs"].__annotations__ + ): + common_kwargs[key] = value # ensure common kwargs are propagated to all relevant modalities for key, value in common_kwargs.items(): @@ -820,10 +826,10 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg # update modality kwargs with passed kwargs for modality in output_kwargs: for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__.keys(): - if modality in kwargs and modality_key in kwargs[modality]: - output_kwargs[modality][modality_key] = kwargs[modality][modality_key] - elif modality_key in kwargs: + if modality_key in kwargs: output_kwargs[modality][modality_key] = kwargs[modality_key] + elif modality in kwargs and modality_key in kwargs[modality]: + output_kwargs[modality][modality_key] = kwargs[modality][modality_key] return output_kwargs @classmethod @@ -988,5 +994,4 @@ def apply_chat_template( ProcessorMixin.push_to_hub = copy_func(ProcessorMixin.push_to_hub) if ProcessorMixin.push_to_hub.__doc__ is not None: ProcessorMixin.push_to_hub.__doc__ = ProcessorMixin.push_to_hub.__doc__.format( - object="processor", object_class="AutoProcessor", object_files="processor files" - ) + object="processor", object_class="AutoProcessor", object_ \ No newline at end of file diff --git a/tests/models/grounding_dino/test_processor_grounding_dino.py b/tests/models/grounding_dino/test_processor_grounding_dino.py index b7a259f0c31526..448aa8f7fb6433 100644 --- a/tests/models/grounding_dino/test_processor_grounding_dino.py +++ b/tests/models/grounding_dino/test_processor_grounding_dino.py @@ -43,6 +43,7 @@ @require_torch @require_vision class 
GroundingDinoProcessorTest(ProcessorTesterMixin, unittest.TestCase): + from_pretrained_id = "IDEA-Research/grounding-dino-base" processor_class = GroundingDinoProcessor def setUp(self): @@ -67,6 +68,13 @@ def setUp(self): with open(self.image_processor_file, "w", encoding="utf-8") as fp: json.dump(image_processor_map, fp) + image_processor = GroundingDinoImageProcessor() + tokenizer = BertTokenizer.from_pretrained(self.from_pretrained_id) + + processor = GroundingDinoProcessor(image_processor, tokenizer) + + processor.save_pretrained(self.tmpdirname) + self.batch_size = 7 self.num_queries = 5 self.embed_dim = 5 @@ -281,4 +289,4 @@ def test_unstructured_kwargs_batched(self): ) self.assertEqual(inputs["pixel_values"].shape[2], 214) - self.assertEqual(len(inputs["input_ids"][0]), 11) + self.assertEqual(len(inputs["input_ids"][0]), 6) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index e6128cde9bb503..b43d48e530b8ce 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -277,13 +277,13 @@ def test_doubly_passed_kwargs(self): input_str = ["lower newer"] image_input = self.prepare_image_inputs() - with self.assertRaises(ValueError): - _ = processor( - text=input_str, - images=image_input, - images_kwargs={"crop_size": {"height": 222, "width": 222}}, - crop_size={"height": 214, "width": 214}, - ) + inputs = processor( + text=input_str, + images=image_input, + images_kwargs={"size": {"height": 222, "width": 222}}, + size={"height": 35, "width": 35}, + ) + self.assertEqual(inputs["pixel_values"][0].shape[2], 35) @require_torch @require_vision From d5b13d2beb4cec5f617bdf6accf40241983c43cb Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 24 Jul 2024 12:29:47 +0000 Subject: [PATCH 15/21] remain --- src/transformers/processing_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 83ad01714dbceb..372e54e9e2776a 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -994,4 +994,5 @@ def apply_chat_template( ProcessorMixin.push_to_hub = copy_func(ProcessorMixin.push_to_hub) if ProcessorMixin.push_to_hub.__doc__ is not None: ProcessorMixin.push_to_hub.__doc__ = ProcessorMixin.push_to_hub.__doc__.format( - object="processor", object_class="AutoProcessor", object_ \ No newline at end of file + object="processor", object_class="AutoProcessor", object_files="processor files" + ) From 1cf9139ef8dad3c48db6f47f6f5c8ed6e351d79d Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 29 Jul 2024 12:27:59 +0000 Subject: [PATCH 16/21] Revert "add molbap's commit" This reverts commit a476c6ee88318ce40d73ea31e2dc2d4faa8ae410. --- .../processing_grounding_dino.py | 23 +- src/transformers/processing_utils.py | 79 ++++--- tests/test_processing_common.py | 200 ++---------------- 3 files changed, 70 insertions(+), 232 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 167b5598bfa7b9..a09d21502b3c07 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -16,14 +16,13 @@ Processor class for Grounding DINO. 
""" -import pathlib import sys -from typing import Dict, List, Optional, Tuple, Union +from typing import List, Tuple, Union from ...image_processing_utils import BatchFeature from ...image_transforms import center_to_corners_format from ...image_utils import ImageInput -from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin +from ...processing_utils import ProcessingKwargs, ProcessorMixin if sys.version_info >= (3, 11): @@ -32,19 +31,12 @@ from typing_extensions import Unpack from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput -from ...utils import ExplicitEnum, TensorType, is_torch_available +from ...utils import TensorType, is_torch_available if is_torch_available(): import torch -AnnotationType = Dict[str, Union[int, str, List[Dict]]] - - -class AnnotationFormat(ExplicitEnum): - COCO_DETECTION = "coco_detection" - COCO_PANOPTIC = "coco_panoptic" - def get_phrases_from_posmap(posmaps, input_ids): """Get token ids of phrases from posmaps and input_ids. @@ -72,16 +64,7 @@ def get_phrases_from_posmap(posmaps, input_ids): return token_ids -class GroundingDinoImagesKwargs(ImagesKwargs, total=False): - annotations: Optional[Union[AnnotationType, List[AnnotationType]]] - return_segmentation_masks: Optional[bool] - masks_path: Optional[Union[str, pathlib.Path]] - do_convert_annotations: Optional[bool] - format: Optional[Union[str, AnnotationFormat]] - - class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: GroundingDinoImagesKwargs _defaults = { "text_kwargs": { "add_special_tokens": True, diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 372e54e9e2776a..9abb4b29fcd7ff 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -151,8 +151,6 @@ class methods and docstrings. Standard deviation to use if normalizing the image. do_pad (`bool`, *optional*): Whether to pad the image to the `(max_height, max_width)` of the images in the batch. - pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. do_center_crop (`bool`, *optional*): Whether to center crop the image. data_format (`ChannelDimension` or `str`, *optional*): @@ -172,7 +170,6 @@ class methods and docstrings. image_mean: Optional[Union[float, List[float]]] image_std: Optional[Union[float, List[float]]] do_pad: Optional[bool] - pad_size: Optional[Dict[str, int]] do_center_crop: Optional[bool] data_format: Optional[ChannelDimension] input_data_format: Optional[Union[str, ChannelDimension]] @@ -324,6 +321,7 @@ class ProcessorMixin(PushToHubMixin): feature_extractor_class = None tokenizer_class = None _auto_class = None + valid_kwargs: List[str] = [] # args have to match the attributes class attribute def __init__(self, *args, **kwargs): @@ -702,14 +700,15 @@ def from_args_and_dict(cls, args, processor_dict: Dict[str, Any], **kwargs): return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) chat_template = kwargs.pop("chat_template", None) - # Unlike image processors or feature extractors whose `__init__` accept `kwargs`, processor don't have `kwargs`. - # We have to pop up some unused (but specific) arguments to make it work. 
+ # We have to pop up some unused (but specific) kwargs and then validate that it doesn't contain unused kwargs + # If we don't pop, some specific kwargs will raise a warning if "processor_class" in processor_dict: del processor_dict["processor_class"] if "auto_map" in processor_dict: del processor_dict["auto_map"] + unused_kwargs = cls.validate_init_kwargs(processor_config=processor_dict, valid_kwargs=cls.valid_kwargs) processor = cls(*args, **processor_dict) if chat_template is not None: setattr(processor, "chat_template", chat_template) @@ -719,6 +718,7 @@ def from_args_and_dict(cls, args, processor_dict: Dict[str, Any], **kwargs): if hasattr(processor, key): setattr(processor, key, kwargs.pop(key)) + kwargs.update(unused_kwargs) logger.info(f"Processor {processor}") if return_unused_kwargs: return processor, kwargs @@ -796,40 +796,38 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg if modality_key in tokenizer_init_kwargs: default_kwargs[modality][modality_key] = tokenizer_init_kwargs[modality_key] # now defaults kwargs are updated with the tokenizers defaults. + # pass defaults to output dictionary output_kwargs.update(default_kwargs) - # gather common kwargs and remove them from individual kwargs if present - common_kwargs = {} - for key, value in kwargs.items(): - if key == "common_kwargs": - for common_key, common_value in value.items(): - common_kwargs[common_key] = common_value - elif key in ["text_kwargs", "images_kwargs", "audio_kwargs", "videos_kwargs"]: - pass - elif ( - key not in ModelProcessorKwargs.__annotations__["text_kwargs"].__annotations__ - and key not in ModelProcessorKwargs.__annotations__["images_kwargs"].__annotations__ - and key not in ModelProcessorKwargs.__annotations__["audio_kwargs"].__annotations__ - and key not in ModelProcessorKwargs.__annotations__["videos_kwargs"].__annotations__ - ): - common_kwargs[key] = value - - # ensure common kwargs are propagated to all relevant modalities - for key, value in common_kwargs.items(): - for modality in output_kwargs: - if modality != "common_kwargs": - output_kwargs[modality][key] = value - - # remove common kwargs from the kwargs to process the rest - kwargs = {k: v for k, v in kwargs.items() if k not in common_kwargs} - # update modality kwargs with passed kwargs + non_modality_kwargs = set(kwargs) - set(output_kwargs) for modality in output_kwargs: for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__.keys(): - if modality_key in kwargs: - output_kwargs[modality][modality_key] = kwargs[modality_key] - elif modality in kwargs and modality_key in kwargs[modality]: - output_kwargs[modality][modality_key] = kwargs[modality][modality_key] + # check if we received a structured kwarg dict or not to handle it correctly + if modality in kwargs: + kwarg_value = kwargs[modality].pop(modality_key, "__empty__") + # check if this key was passed as a flat kwarg. + if kwarg_value != "__empty__" and modality_key in non_modality_kwargs: + raise ValueError( + f"Keyword argument {modality_key} was passed two times: in a dictionary for {modality} and as a **kwarg." 
+ ) + elif modality_key in kwargs: + kwarg_value = kwargs.pop(modality_key, "__empty__") + else: + kwarg_value = "__empty__" + if kwarg_value != "__empty__": + output_kwargs[modality][modality_key] = kwarg_value + # if something remains in kwargs, it belongs to common after flattening + if set(kwargs) & set(default_kwargs): + # here kwargs is dictionary-based since it shares keys with default set + [output_kwargs["common_kwargs"].update(subdict) for _, subdict in kwargs.items()] + else: + # here it's a flat dict + output_kwargs["common_kwargs"].update(kwargs) + + # all modality-specific kwargs are updated with common kwargs + for modality in output_kwargs: + output_kwargs[modality].update(output_kwargs["common_kwargs"]) return output_kwargs @classmethod @@ -945,6 +943,19 @@ def model_input_names(self): first_attribute = getattr(self, self.attributes[0]) return getattr(first_attribute, "model_input_names", None) + @staticmethod + def validate_init_kwargs(processor_config, valid_kwargs): + kwargs_from_config = processor_config.keys() + unused_kwargs = {} + unused_keys = set(kwargs_from_config) - set(valid_kwargs) + if unused_keys: + unused_key_str = ", ".join(unused_keys) + logger.warning( + f"Some kwargs in processor config are unused and will not have any effect: {unused_key_str}. " + ) + unused_kwargs = {k: processor_config[k] for k in unused_keys} + return unused_kwargs + def apply_chat_template( self, conversation: Union[List[Dict[str, str]]], diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index b43d48e530b8ce..bb4d86d3f5a500 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -16,7 +16,6 @@ import inspect import json -import random import tempfile @@ -39,31 +38,15 @@ from transformers.utils import is_vision_available -global_rng = random.Random() - if is_vision_available(): from PIL import Image from transformers import CLIPImageProcessor -# Copied from tests.models.whisper.test_feature_extraction_whisper.floats_list -def floats_list(shape, scale=1.0, rng=None, name=None): - """Creates a random float32 tensor""" - if rng is None: - rng = global_rng - - values = [] - for batch_idx in range(shape[0]): - values.append([]) - for _ in range(shape[1]): - values[-1].append(rng.random() * scale) - - return values - - @require_torch @require_vision +@require_torch class ProcessorTesterMixin: processor_class = None @@ -77,10 +60,7 @@ def get_component(self, attribute, **kwargs): component_class_name = component_class_name[0] component_class = processor_class_from_name(component_class_name) - if hasattr(self, "tmpdirname"): - component = component_class.from_pretrained(self.tmpdirname, **kwargs) # noqa - elif hasattr(self, "model_id"): - component = component_class.from_pretrained(self.model_id, **kwargs) # noqa + component = component_class.from_pretrained(self.tmpdirname, **kwargs) # noqa return component @@ -146,13 +126,13 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" + tokenizer = self.get_component("tokenizer", max_length=117) + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) 
input_str = "lower newer" image_input = self.prepare_image_inputs() + inputs = processor(text=input_str, images=image_input, return_tensors="pt") self.assertEqual(len(inputs["input_ids"][0]), 117) @@ -161,15 +141,15 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): def test_image_processor_defaults_preserved_by_image_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", crop_size=(234, 234), size=(234, 234)) + image_processor = self.get_component("image_processor", crop_size=(234, 234)) tokenizer = self.get_component("tokenizer", max_length=117) - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() + inputs = processor(text=input_str, images=image_input) self.assertEqual(len(inputs["pixel_values"][0][0]), 234) @@ -180,15 +160,13 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer", max_length=117) - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor( - text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length" - ) + + inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=112) self.assertEqual(len(inputs["input_ids"][0]), 112) @require_torch @@ -196,17 +174,16 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): def test_kwargs_overrides_default_image_processor_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", crop_size=(234, 234), size=(234, 234)) + image_processor = self.get_component("image_processor", crop_size=(234, 234)) tokenizer = self.get_component("tokenizer", max_length=117) - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input, crop_size=[224, 224], size=[224, 224]) + inputs = processor(text=input_str, images=image_input, crop_size=[224, 224]) self.assertEqual(len(inputs["pixel_values"][0][0]), 224) @require_torch @@ -216,8 +193,7 @@ def test_unstructured_kwargs(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) @@ -228,7 +204,6 @@ def test_unstructured_kwargs(self): images=image_input, return_tensors="pt", 
crop_size={"height": 214, "width": 214}, - size={"height": 214, "width": 214}, padding="max_length", max_length=76, ) @@ -243,8 +218,7 @@ def test_unstructured_kwargs_batched(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) @@ -255,10 +229,10 @@ def test_unstructured_kwargs_batched(self): images=image_input, return_tensors="pt", crop_size={"height": 214, "width": 214}, - size={"height": 214, "width": 214}, padding="longest", max_length=76, ) + self.assertEqual(inputs["pixel_values"].shape[2], 214) self.assertEqual(len(inputs["input_ids"][0]), 6) @@ -270,8 +244,7 @@ def test_doubly_passed_kwargs(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) @@ -292,8 +265,7 @@ def test_structured_kwargs_nested(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) @@ -303,7 +275,7 @@ def test_structured_kwargs_nested(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"crop_size": {"height": 214, "width": 214}, "size": {"height": 214, "width": 214}}, + "images_kwargs": {"crop_size": {"height": 214, "width": 214}}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } @@ -322,8 +294,7 @@ def test_structured_kwargs_nested_from_dict(self): image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" @@ -332,7 +303,7 @@ def test_structured_kwargs_nested_from_dict(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"crop_size": {"height": 214, "width": 214}, "size": {"height": 214, "width": 214}}, + "images_kwargs": {"crop_size": {"height": 214, "width": 214}}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } @@ -341,133 +312,6 @@ def test_structured_kwargs_nested_from_dict(self): self.assertEqual(len(inputs["input_ids"][0]), 76) - # text + audio kwargs testing - @require_torch - def test_tokenizer_defaults_preserved_by_kwargs_audio(self): - if "feature_extractor" not in self.processor_class.attributes: - self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") - feature_extractor = self.get_component("feature_extractor") - if hasattr(self, "get_tokenizer"): - tokenizer = self.get_tokenizer(max_length=117, padding="max_length") - elif 
hasattr(self, "get_component"): - tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" - processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) - self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" - raw_speech = floats_list((3, 1000)) - inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt") - if "input_ids" in inputs: - self.assertEqual(len(inputs["input_ids"][0]), 117) - elif "labels" in inputs: - self.assertEqual(len(inputs["labels"][0]), 117) - - @require_torch - def test_kwargs_overrides_default_tokenizer_kwargs_audio(self): - if "feature_extractor" not in self.processor_class.attributes: - self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") - feature_extractor = self.get_component("feature_extractor") - if hasattr(self, "get_tokenizer"): - tokenizer = self.get_tokenizer(max_length=117) - elif hasattr(self, "get_component"): - tokenizer = self.get_component("tokenizer", max_length=117) - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" - processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) - self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" - raw_speech = floats_list((3, 1000)) - inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt", max_length=112, padding="max_length") - if "input_ids" in inputs: - self.assertEqual(len(inputs["input_ids"][0]), 112) - elif "labels" in inputs: - self.assertEqual(len(inputs["labels"][0]), 112) - - @require_torch - def test_unstructured_kwargs_audio(self): - if "feature_extractor" not in self.processor_class.attributes: - self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") - feature_extractor = self.get_component("feature_extractor") - if hasattr(self, "get_tokenizer"): - tokenizer = self.get_tokenizer(max_length=117) - elif hasattr(self, "get_component"): - tokenizer = self.get_component("tokenizer", max_length=117) - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" - processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = "lower newer" - raw_speech = floats_list((3, 1000)) - inputs = processor( - text=input_str, - audio=raw_speech, - return_tensors="pt", - padding="max_length", - max_length=76, - ) - - if "input_ids" in inputs: - self.assertEqual(len(inputs["input_ids"][0]), 76) - elif "labels" in inputs: - self.assertEqual(len(inputs["labels"][0]), 76) - - @require_torch - def test_doubly_passed_kwargs_audio(self): - if "feature_extractor" not in self.processor_class.attributes: - self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") - feature_extractor = self.get_component("feature_extractor") - if hasattr(self, "get_tokenizer"): - tokenizer = self.get_tokenizer() - elif hasattr(self, "get_component"): - tokenizer = self.get_component("tokenizer") - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" - processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = ["lower newer"] - raw_speech = floats_list((3, 1000)) - with self.assertRaises(ValueError): - _ = processor( - text=input_str, - audio=raw_speech, - audio_kwargs={"padding": "max_length"}, - 
padding="max_length", - ) - - @require_torch - @require_vision - def test_structured_kwargs_audio_nested(self): - if "feature_extractor" not in self.processor_class.attributes: - self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") - feature_extractor = self.get_component("feature_extractor") - if hasattr(self, "get_tokenizer"): - tokenizer = self.get_tokenizer() - elif hasattr(self, "get_component"): - tokenizer = self.get_component("tokenizer") - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" - processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = ["lower newer"] - raw_speech = floats_list((3, 1000)) - - # Define the kwargs for each modality - all_kwargs = { - "common_kwargs": {"return_tensors": "pt"}, - "audio_kwargs": {"padding": "max_length", "max_length": 66}, - "text_kwargs": {"padding": "max_length", "max_length": 76}, - } - - inputs = processor(text=input_str, audio=raw_speech, **all_kwargs) - if "input_ids" in inputs: - self.assertEqual(len(inputs["input_ids"][0]), 76) - elif "labels" in inputs: - self.assertEqual(len(inputs["labels"][0]), 76) - class MyProcessor(ProcessorMixin): attributes = ["image_processor", "tokenizer"] From 86722b43d1577e9e9f697e6186083851aa9001d6 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 29 Jul 2024 13:09:27 +0000 Subject: [PATCH 17/21] add unsync PR --- .../processing_grounding_dino.py | 27 +++++++++++++++---- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index a09d21502b3c07..444b29085b0142 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -16,13 +16,14 @@ Processor class for Grounding DINO. """ +import pathlib import sys -from typing import List, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union from ...image_processing_utils import BatchFeature from ...image_transforms import center_to_corners_format from ...image_utils import ImageInput -from ...processing_utils import ProcessingKwargs, ProcessorMixin +from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin if sys.version_info >= (3, 11): @@ -31,13 +32,21 @@ from typing_extensions import Unpack from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput -from ...utils import TensorType, is_torch_available +from ...utils import ExplicitEnum, TensorType, is_torch_available if is_torch_available(): import torch +AnnotationType = Dict[str, Union[int, str, List[Dict]]] + + +class AnnotationFormat(ExplicitEnum): + COCO_DETECTION = "coco_detection" + COCO_PANOPTIC = "coco_panoptic" + + def get_phrases_from_posmap(posmaps, input_ids): """Get token ids of phrases from posmaps and input_ids. 
@@ -64,7 +73,16 @@ def get_phrases_from_posmap(posmaps, input_ids): return token_ids +class GroundingDinoImagesKwargs(ImagesKwargs, total=False): + annotations: Optional[Union[AnnotationType, List[AnnotationType]]] + return_segmentation_masks: Optional[bool] + masks_path: Optional[Union[str, pathlib.Path]] + do_convert_annotations: Optional[bool] + format: Optional[Union[str, AnnotationFormat]] + + class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: GroundingDinoImagesKwargs _defaults = { "text_kwargs": { "add_special_tokens": True, @@ -76,8 +94,7 @@ class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): "return_token_type_ids": True, "return_length": False, "verbose": True, - }, - "images_kwargs": {}, + } } From 8baa8e080eb0724d831341f22d25fd5607ff4b76 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 29 Jul 2024 13:25:34 +0000 Subject: [PATCH 18/21] revert --- tests/test_processing_common.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index bb4d86d3f5a500..074aa2f1d62545 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -250,13 +250,13 @@ def test_doubly_passed_kwargs(self): input_str = ["lower newer"] image_input = self.prepare_image_inputs() - inputs = processor( - text=input_str, - images=image_input, - images_kwargs={"size": {"height": 222, "width": 222}}, - size={"height": 35, "width": 35}, - ) - self.assertEqual(inputs["pixel_values"][0].shape[2], 35) + with self.assertRaises(ValueError): + _ = processor( + text=input_str, + images=image_input, + images_kwargs={"crop_size": {"height": 222, "width": 222}}, + crop_size={"height": 214, "width": 214}, + ) @require_torch @require_vision From 39f28afc08d3d4a371112c830acdd5a73d71e87b Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 5 Aug 2024 06:54:07 +0000 Subject: [PATCH 19/21] make CI happy --- .../test_processor_grounding_dino.py | 157 ++++++++++++++++++ 1 file changed, 157 insertions(+) diff --git a/tests/models/grounding_dino/test_processor_grounding_dino.py b/tests/models/grounding_dino/test_processor_grounding_dino.py index 448aa8f7fb6433..32c61e407df027 100644 --- a/tests/models/grounding_dino/test_processor_grounding_dino.py +++ b/tests/models/grounding_dino/test_processor_grounding_dino.py @@ -22,6 +22,7 @@ import pytest from transformers import BertTokenizer, BertTokenizerFast, GroundingDinoProcessor +from transformers.models.auto.processing_auto import processor_class_from_name from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES from transformers.testing_utils import require_torch, require_vision from transformers.utils import IMAGE_PROCESSOR_NAME, is_torch_available, is_vision_available @@ -80,6 +81,17 @@ def setUp(self): self.embed_dim = 5 self.seq_length = 5 + def get_component(self, attribute, **kwargs): + assert attribute in self.processor_class.attributes + component_class_name = getattr(self.processor_class, f"{attribute}_class") + if isinstance(component_class_name, tuple): + component_class_name = component_class_name[0] + + component_class = processor_class_from_name(component_class_name) + component = component_class.from_pretrained(self.tmpdirname, **kwargs) # noqa + + return component + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.get_tokenizer with CLIP->Bert def get_tokenizer(self, **kwargs): return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs) @@ -264,6 +276,151 @@ 
def test_model_input_names(self): self.assertListEqual(list(inputs.keys()), processor.model_input_names) + @require_torch + @require_vision + def test_image_processor_defaults_preserved_by_image_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor", size={"height": 234, "width": 234}) + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + self.assertEqual(len(inputs["pixel_values"][0][0]), 234) + + @require_vision + @require_torch + def test_kwargs_overrides_default_tokenizer_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=112) + self.assertEqual(len(inputs["input_ids"][0]), 4) + + @require_vision + @require_torch + def test_tokenizer_defaults_preserved_by_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input, return_tensors="pt") + self.assertEqual(len(inputs["input_ids"][0]), 4) + + @require_torch + @require_vision + def test_kwargs_overrides_default_image_processor_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor", size=(234, 234)) + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input, size=[224, 224]) + self.assertEqual(len(inputs["pixel_values"][0][0]), 224) + + @require_torch + @require_vision + def test_structured_kwargs_nested(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = 
self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": {"size": {"height": 214, "width": 214}}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.skip_processor_without_typed_kwargs(processor) + + self.assertEqual(inputs["pixel_values"].shape[2], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + @require_vision + def test_structured_kwargs_nested_from_dict(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": {"size": {"height": 214, "width": 214}}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.assertEqual(inputs["pixel_values"].shape[2], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + @require_vision + def test_unstructured_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + size={"height": 214, "width": 214}, + padding="max_length", + max_length=76, + ) + + self.assertEqual(inputs["pixel_values"].shape[2], 214) + self.assertEqual(len(inputs["input_ids"][0]), 76) + @require_torch @require_vision def test_unstructured_kwargs_batched(self): From 7366aab07281f4792a5d85626efcf65b27b62a78 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 5 Aug 2024 14:12:19 +0000 Subject: [PATCH 20/21] nit --- .../test_processor_grounding_dino.py | 22 +++++-------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/tests/models/grounding_dino/test_processor_grounding_dino.py b/tests/models/grounding_dino/test_processor_grounding_dino.py index 32c61e407df027..c0bb186b392eb0 100644 --- a/tests/models/grounding_dino/test_processor_grounding_dino.py +++ b/tests/models/grounding_dino/test_processor_grounding_dino.py @@ -22,7 +22,6 @@ import pytest from transformers import BertTokenizer, BertTokenizerFast, GroundingDinoProcessor -from transformers.models.auto.processing_auto import processor_class_from_name from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES from transformers.testing_utils import require_torch, require_vision from transformers.utils import IMAGE_PROCESSOR_NAME, is_torch_available, is_vision_available @@ -81,17 +80,6 @@ def setUp(self): self.embed_dim = 5 self.seq_length = 5 - def get_component(self, attribute, **kwargs): - assert attribute in 
self.processor_class.attributes - component_class_name = getattr(self.processor_class, f"{attribute}_class") - if isinstance(component_class_name, tuple): - component_class_name = component_class_name[0] - - component_class = processor_class_from_name(component_class_name) - component = component_class.from_pretrained(self.tmpdirname, **kwargs) # noqa - - return component - # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.get_tokenizer with CLIP->Bert def get_tokenizer(self, **kwargs): return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs) @@ -306,8 +294,10 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=112) - self.assertEqual(len(inputs["input_ids"][0]), 4) + inputs = processor( + text=input_str, images=image_input, return_tensors="pt", padding="max_length", max_length=112 + ) + self.assertEqual(len(inputs["input_ids"][0]), 112) @require_vision @require_torch @@ -322,8 +312,8 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input, return_tensors="pt") - self.assertEqual(len(inputs["input_ids"][0]), 4) + inputs = processor(text=input_str, images=image_input, return_tensors="pt", padding="max_length") + self.assertEqual(len(inputs["input_ids"][0]), 117) @require_torch @require_vision From 64839abcb2ab0424f1b8f83a90f184a3b98ddfe8 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 7 Aug 2024 22:44:18 +0000 Subject: [PATCH 21/21] import annotationformat --- .../models/grounding_dino/processing_grounding_dino.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 444b29085b0142..00c183338be056 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -22,7 +22,7 @@ from ...image_processing_utils import BatchFeature from ...image_transforms import center_to_corners_format -from ...image_utils import ImageInput +from ...image_utils import AnnotationFormat, ImageInput from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin @@ -32,7 +32,7 @@ from typing_extensions import Unpack from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput -from ...utils import ExplicitEnum, TensorType, is_torch_available +from ...utils import TensorType, is_torch_available if is_torch_available(): @@ -42,11 +42,6 @@ AnnotationType = Dict[str, Union[int, str, List[Dict]]] -class AnnotationFormat(ExplicitEnum): - COCO_DETECTION = "coco_detection" - COCO_PANOPTIC = "coco_panoptic" - - def get_phrases_from_posmap(posmaps, input_ids): """Get token ids of phrases from posmaps and input_ids.
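After patch 21, the series settles on the following public behaviour for
GroundingDinoProcessor: tokenizer and image-processor kwargs can be passed
either nested per modality or flat, _merge_kwargs routes each key to the right
component, and passing the same key both ways raises a ValueError. The snippet
below is an illustrative sketch, not part of any patch: it reuses the
"IDEA-Research/grounding-dino-base" checkpoint from the tests, substitutes a
random array for the test mixin's prepare_image_inputs(), and mirrors the
assertions of test_structured_kwargs_nested and test_unstructured_kwargs.

    import numpy as np
    from PIL import Image

    from transformers import BertTokenizer, GroundingDinoImageProcessor, GroundingDinoProcessor

    tokenizer = BertTokenizer.from_pretrained("IDEA-Research/grounding-dino-base")
    image_processor = GroundingDinoImageProcessor()
    processor = GroundingDinoProcessor(image_processor, tokenizer)

    # Stand-in for prepare_image_inputs() in the common tester.
    image = Image.fromarray(np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8))

    # Structured form: one dict per modality plus common kwargs.
    inputs = processor(
        text="lower newer",
        images=image,
        images_kwargs={"size": {"height": 214, "width": 214}},
        text_kwargs={"padding": "max_length", "max_length": 76},
        common_kwargs={"return_tensors": "pt"},
    )
    assert inputs["pixel_values"].shape[2] == 214  # resized via images_kwargs
    assert len(inputs["input_ids"][0]) == 76  # padded via text_kwargs

    # Flat form: each kwarg is routed to the matching modality.
    inputs = processor(
        text="lower newer",
        images=image,
        size={"height": 214, "width": 214},
        padding="max_length",
        max_length=76,
        return_tensors="pt",
    )
    assert inputs["pixel_values"].shape[2] == 214
    assert len(inputs["input_ids"][0]) == 76

    # Passing the same key both flat and nested, e.g. size=... together with
    # images_kwargs={"size": ...}, raises a ValueError, which is the behaviour
    # test_doubly_passed_kwargs asserts once patch 18 restores it.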