From 51c201ac45fa0ebd91791f558987f5d6bc4f0ca0 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 15 Jul 2024 07:19:58 +0000 Subject: [PATCH 01/21] fix typo --- docs/source/en/model_doc/grounding-dino.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/en/model_doc/grounding-dino.md b/docs/source/en/model_doc/grounding-dino.md index d258f492abf8b5..609572a2393f01 100644 --- a/docs/source/en/model_doc/grounding-dino.md +++ b/docs/source/en/model_doc/grounding-dino.md @@ -45,19 +45,19 @@ import requests import torch from PIL import Image -from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection, +from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection model_id = "IDEA-Research/grounding-dino-tiny" processor = AutoProcessor.from_pretrained(model_id) -model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device) +model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id) image_url = "http://images.cocodataset.org/val2017/000000039769.jpg" image = Image.open(requests.get(image_url, stream=True).raw) # Check for cats and remote controls text = "a cat. a remote control." -inputs = processor(images=image, text=text, return_tensors="pt").to(device) +inputs = processor(images=image, text=text, return_tensors="pt") with torch.no_grad(): outputs = model(**inputs) From 9dd38e20bcebfa3f6cdf812f79f4a50fe0104363 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 15 Jul 2024 07:20:07 +0000 Subject: [PATCH 02/21] uniform kwargs --- .../processing_grounding_dino.py | 74 +++++++++++-------- 1 file changed, 44 insertions(+), 30 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 44b99811d931ce..982b82b37ffe5f 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -16,12 +16,20 @@ Processor class for Grounding DINO. 
""" +import sys from typing import List, Optional, Tuple, Union from ...image_processing_utils import BatchFeature from ...image_transforms import center_to_corners_format from ...image_utils import ImageInput -from ...processing_utils import ProcessorMixin +from ...processing_utils import ProcessingKwargs, ProcessorMixin + + +if sys.version_info >= (3, 11): + from typing import Unpack +else: + from typing_extensions import Unpack + from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy from ...utils import TensorType, is_torch_available @@ -56,6 +64,26 @@ def get_phrases_from_posmap(posmaps, input_ids): return token_ids +class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "add_special_tokens": True, + "padding": False, + "truncation": None, + "max_length": None, + "stride": 0, + "pad_to_multiple_of": None, + "return_attention_mask": None, + "return_overflowing_tokens": False, + "return_special_tokens_mask": False, + "return_offsets_mapping": False, + "return_token_type_ids": True, + "return_length": False, + "verbose": True, + } + } + + class GroundingDinoProcessor(ProcessorMixin): r""" Constructs a Grounding DINO processor which wraps a Deformable DETR image processor and a BERT tokenizer into a @@ -83,21 +111,8 @@ def __call__( self, images: ImageInput = None, text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, - add_special_tokens: bool = True, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = None, - max_length: Optional[int] = None, - stride: int = 0, - pad_to_multiple_of: Optional[int] = None, - return_attention_mask: Optional[bool] = None, - return_overflowing_tokens: bool = False, - return_special_tokens_mask: bool = False, - return_offsets_mapping: bool = False, - return_token_type_ids: bool = True, - return_length: bool = False, - verbose: bool = True, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs, + **kwargs: Unpack[GroundingDinoProcessorKwargs], ) -> BatchEncoding: """ This method uses [`GroundingDinoImageProcessor.__call__`] method to prepare image(s) for the model, and @@ -108,30 +123,29 @@ def __call__( if images is None and text is None: raise ValueError("You have to specify either images or text.") + output_kwargs = self._merge_kwargs( + GroundingDinoProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + + # BC for explicit return_tensors + if "return_tensors" in output_kwargs["common_kwargs"]: + return_tensors = output_kwargs["common_kwargs"].pop("return_tensors", None) + # Get only text if images is not None: - encoding_image_processor = self.image_processor(images, return_tensors=return_tensors) + encoding_image_processor = self.image_processor( + images, return_tensors=return_tensors, **output_kwargs["images_kwargs"] + ) else: encoding_image_processor = BatchFeature() if text is not None: text_encoding = self.tokenizer( text=text, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - stride=stride, - pad_to_multiple_of=pad_to_multiple_of, - return_attention_mask=return_attention_mask, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_offsets_mapping=return_offsets_mapping, - return_token_type_ids=return_token_type_ids, - return_length=return_length, - verbose=verbose, 
return_tensors=return_tensors, - **kwargs, + **output_kwargs["text_kwargs"], ) else: text_encoding = BatchEncoding() From ebc386282dccc725a1a7be9ae1edab25eda1a977 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 15 Jul 2024 07:24:45 +0000 Subject: [PATCH 03/21] make style --- .../models/grounding_dino/processing_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 982b82b37ffe5f..1020820efb8050 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -30,7 +30,7 @@ else: from typing_extensions import Unpack -from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput from ...utils import TensorType, is_torch_available From c6dc44528b39f8f58f072d2db912844ec501b9a3 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 15 Jul 2024 13:05:22 +0000 Subject: [PATCH 04/21] add comments --- .../processing_grounding_dino.py | 17 +++++----- src/transformers/processing_utils.py | 28 ++++++++++++++++ .../test_processor_grounding_dino.py | 33 ++++++++++++++++++- 3 files changed, 69 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 1020820efb8050..fe80864f8e0d09 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -17,7 +17,7 @@ """ import sys -from typing import List, Optional, Tuple, Union +from typing import List, Tuple, Union from ...image_processing_utils import BatchFeature from ...image_transforms import center_to_corners_format @@ -69,18 +69,18 @@ class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): "text_kwargs": { "add_special_tokens": True, "padding": False, - "truncation": None, - "max_length": None, "stride": 0, - "pad_to_multiple_of": None, - "return_attention_mask": None, "return_overflowing_tokens": False, "return_special_tokens_mask": False, "return_offsets_mapping": False, - "return_token_type_ids": True, + "return_token_type_ids": False, "return_length": False, "verbose": True, - } + }, + "images_kwargs": { + "do_convert_annotations": True, + "do_resize": True, + }, } @@ -111,7 +111,8 @@ def __call__( self, images: ImageInput = None, text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, - return_tensors: Optional[Union[str, TensorType]] = None, + audio=None, + videos=None, **kwargs: Unpack[GroundingDinoProcessorKwargs], ) -> BatchEncoding: """ diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 7062a7699a79f7..2e724eb2264a0a 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -20,6 +20,7 @@ import inspect import json import os +import pathlib import warnings from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, TypedDict, Union @@ -40,6 +41,7 @@ ) from .utils import ( PROCESSOR_NAME, + ExplicitEnum, PushToHubMixin, TensorType, add_model_info_to_auto_map, @@ -56,6 +58,14 @@ logger = logging.get_logger(__name__) +AnnotationType = Dict[str, Union[int, str, List[Dict]]] + + 
+class AnnotationFormat(ExplicitEnum): + COCO_DETECTION = "coco_detection" + COCO_PANOPTIC = "coco_panoptic" + + # Dynamically import the Transformers module to grab the attribute classes of the processor form their names. transformers_module = direct_transformers_import(Path(__file__).parent) @@ -128,6 +138,12 @@ class ImagesKwargs(TypedDict, total=False): class methods and docstrings. Attributes: + annotations (`AnnotationType` or `List[AnnotationType]`, *optional*): + List of annotations associated with the image or batch of images. + return_segmentation_masks (`bool`, *optional*): + Whether to return segmentation masks. + masks_path (`str` or `pathlib.Path`, *optional*): + Path to the directory containing the segmentation masks. do_resize (`bool`, *optional*): Whether to resize the image. size (`Dict[str, int]`, *optional*): @@ -144,6 +160,8 @@ class methods and docstrings. Scale factor to use if rescaling the image. do_normalize (`bool`, *optional*): Whether to normalize the image. + do_convert_annotations (`bool`, *optional*): + Whether to convert the annotations to the format expected by the model. image_mean (`float` or `List[float]`, *optional*): Mean to use if normalizing the image. image_std (`float` or `List[float]`, *optional*): @@ -152,12 +170,19 @@ class methods and docstrings. Whether to pad the image to the `(max_height, max_width)` of the images in the batch. do_center_crop (`bool`, *optional*): Whether to center crop the image. + format (`str` or `AnnotationFormat`, *optional*): + Format of the annotations. data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the output image. input_data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the input image. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. """ + annotations: Optional[Union[AnnotationType, List[AnnotationType]]] + return_segmentation_masks: Optional[bool] + masks_path: Optional[Union[str, pathlib.Path]] do_resize: Optional[bool] size: Optional[Dict[str, int]] size_divisor: Optional[int] @@ -166,12 +191,15 @@ class methods and docstrings. 
do_rescale: Optional[bool] rescale_factor: Optional[float] do_normalize: Optional[bool] + do_convert_annotations: Optional[bool] image_mean: Optional[Union[float, List[float]]] image_std: Optional[Union[float, List[float]]] do_pad: Optional[bool] do_center_crop: Optional[bool] + format: Optional[Union[str, AnnotationFormat]] data_format: Optional[ChannelDimension] input_data_format: Optional[Union[str, ChannelDimension]] + pad_size: Optional[Dict[str, int]] class VideosKwargs(TypedDict, total=False): diff --git a/tests/models/grounding_dino/test_processor_grounding_dino.py b/tests/models/grounding_dino/test_processor_grounding_dino.py index a788d09ca7eed1..b7a259f0c31526 100644 --- a/tests/models/grounding_dino/test_processor_grounding_dino.py +++ b/tests/models/grounding_dino/test_processor_grounding_dino.py @@ -26,6 +26,8 @@ from transformers.testing_utils import require_torch, require_vision from transformers.utils import IMAGE_PROCESSOR_NAME, is_torch_available, is_vision_available +from ...test_processing_common import ProcessorTesterMixin + if is_torch_available(): import torch @@ -40,7 +42,9 @@ @require_torch @require_vision -class GroundingDinoProcessorTest(unittest.TestCase): +class GroundingDinoProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = GroundingDinoProcessor + def setUp(self): self.tmpdirname = tempfile.mkdtemp() @@ -251,3 +255,30 @@ def test_model_input_names(self): inputs = processor(text=input_str, images=image_input) self.assertListEqual(list(inputs.keys()), processor.model_input_names) + + @require_torch + @require_vision + def test_unstructured_kwargs_batched(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = ["lower newer", "upper older longer string"] + image_input = self.prepare_image_inputs() * 2 + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + crop_size={"height": 214, "width": 214}, + size={"height": 214, "width": 214}, + padding="longest", + max_length=76, + ) + self.assertEqual(inputs["pixel_values"].shape[2], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 11) From 16ddefd33d42be09015f6e06916d1f170ed050ac Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Tue, 16 Jul 2024 01:00:01 +0000 Subject: [PATCH 05/21] remove return_tensors --- .../models/grounding_dino/processing_grounding_dino.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index fe80864f8e0d09..4aa081fd70f224 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -122,7 +122,7 @@ def __call__( Please refer to the docstring of the above two methods for more information. 
""" if images is None and text is None: - raise ValueError("You have to specify either images or text.") + raise ValueError("You must specify either text or images.") output_kwargs = self._merge_kwargs( GroundingDinoProcessorKwargs, @@ -130,14 +130,10 @@ def __call__( **kwargs, ) - # BC for explicit return_tensors - if "return_tensors" in output_kwargs["common_kwargs"]: - return_tensors = output_kwargs["common_kwargs"].pop("return_tensors", None) - # Get only text if images is not None: encoding_image_processor = self.image_processor( - images, return_tensors=return_tensors, **output_kwargs["images_kwargs"] + images, **output_kwargs["common_kwargs"], **output_kwargs["images_kwargs"] ) else: encoding_image_processor = BatchFeature() @@ -145,7 +141,7 @@ def __call__( if text is not None: text_encoding = self.tokenizer( text=text, - return_tensors=return_tensors, + **output_kwargs["common_kwargs"], **output_kwargs["text_kwargs"], ) else: From 1f9a0eeb22e07fd6f0a8a904146e7186e4938c95 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Tue, 23 Jul 2024 01:39:42 +0000 Subject: [PATCH 06/21] remove common_kwargs from processor since it propagates --- .../models/grounding_dino/processing_grounding_dino.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 4aa081fd70f224..c005f2d031087c 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -133,7 +133,7 @@ def __call__( # Get only text if images is not None: encoding_image_processor = self.image_processor( - images, **output_kwargs["common_kwargs"], **output_kwargs["images_kwargs"] + images, **output_kwargs["images_kwargs"] ) else: encoding_image_processor = BatchFeature() @@ -141,7 +141,6 @@ def __call__( if text is not None: text_encoding = self.tokenizer( text=text, - **output_kwargs["common_kwargs"], **output_kwargs["text_kwargs"], ) else: From 0696dcf60720e8aa9d94c519c3f3da14d5820bb2 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Tue, 23 Jul 2024 01:51:09 +0000 Subject: [PATCH 07/21] make style --- .../models/grounding_dino/processing_grounding_dino.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index c005f2d031087c..9eccc7320ccc6b 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -132,9 +132,7 @@ def __call__( # Get only text if images is not None: - encoding_image_processor = self.image_processor( - images, **output_kwargs["images_kwargs"] - ) + encoding_image_processor = self.image_processor(images, **output_kwargs["images_kwargs"]) else: encoding_image_processor = BatchFeature() From 850b9d5f6ea7224d6afd17311a20880d86f144d9 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Tue, 23 Jul 2024 02:06:37 +0000 Subject: [PATCH 08/21] return_token_type_ids to True --- .../models/grounding_dino/processing_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 9eccc7320ccc6b..74cda0c5953915 100644 --- 
a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -73,7 +73,7 @@ class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): "return_overflowing_tokens": False, "return_special_tokens_mask": False, "return_offsets_mapping": False, - "return_token_type_ids": False, + "return_token_type_ids": True, "return_length": False, "verbose": True, }, From c96c02b6ecdb0047c1ec19772244042ea3086831 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 24 Jul 2024 00:46:39 +0000 Subject: [PATCH 09/21] revert the default imagekwargs since does not accept any value in the image processro --- .../models/grounding_dino/processing_grounding_dino.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 74cda0c5953915..9928eacbeb5ce0 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -78,8 +78,6 @@ class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): "verbose": True, }, "images_kwargs": { - "do_convert_annotations": True, - "do_resize": True, }, } From 8cff6b609438691cb4cbc1195bd6f058d8794e4b Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 24 Jul 2024 01:08:01 +0000 Subject: [PATCH 10/21] revert processing_utils.py --- src/transformers/processing_utils.py | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 2e724eb2264a0a..7062a7699a79f7 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -20,7 +20,6 @@ import inspect import json import os -import pathlib import warnings from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, TypedDict, Union @@ -41,7 +40,6 @@ ) from .utils import ( PROCESSOR_NAME, - ExplicitEnum, PushToHubMixin, TensorType, add_model_info_to_auto_map, @@ -58,14 +56,6 @@ logger = logging.get_logger(__name__) -AnnotationType = Dict[str, Union[int, str, List[Dict]]] - - -class AnnotationFormat(ExplicitEnum): - COCO_DETECTION = "coco_detection" - COCO_PANOPTIC = "coco_panoptic" - - # Dynamically import the Transformers module to grab the attribute classes of the processor form their names. transformers_module = direct_transformers_import(Path(__file__).parent) @@ -138,12 +128,6 @@ class ImagesKwargs(TypedDict, total=False): class methods and docstrings. Attributes: - annotations (`AnnotationType` or `List[AnnotationType]`, *optional*): - List of annotations associated with the image or batch of images. - return_segmentation_masks (`bool`, *optional*): - Whether to return segmentation masks. - masks_path (`str` or `pathlib.Path`, *optional*): - Path to the directory containing the segmentation masks. do_resize (`bool`, *optional*): Whether to resize the image. size (`Dict[str, int]`, *optional*): @@ -160,8 +144,6 @@ class methods and docstrings. Scale factor to use if rescaling the image. do_normalize (`bool`, *optional*): Whether to normalize the image. - do_convert_annotations (`bool`, *optional*): - Whether to convert the annotations to the format expected by the model. image_mean (`float` or `List[float]`, *optional*): Mean to use if normalizing the image. image_std (`float` or `List[float]`, *optional*): @@ -170,19 +152,12 @@ class methods and docstrings. 
Whether to pad the image to the `(max_height, max_width)` of the images in the batch. do_center_crop (`bool`, *optional*): Whether to center crop the image. - format (`str` or `AnnotationFormat`, *optional*): - Format of the annotations. data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the output image. input_data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the input image. - pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. """ - annotations: Optional[Union[AnnotationType, List[AnnotationType]]] - return_segmentation_masks: Optional[bool] - masks_path: Optional[Union[str, pathlib.Path]] do_resize: Optional[bool] size: Optional[Dict[str, int]] size_divisor: Optional[int] @@ -191,15 +166,12 @@ class methods and docstrings. do_rescale: Optional[bool] rescale_factor: Optional[float] do_normalize: Optional[bool] - do_convert_annotations: Optional[bool] image_mean: Optional[Union[float, List[float]]] image_std: Optional[Union[float, List[float]]] do_pad: Optional[bool] do_center_crop: Optional[bool] - format: Optional[Union[str, AnnotationFormat]] data_format: Optional[ChannelDimension] input_data_format: Optional[Union[str, ChannelDimension]] - pad_size: Optional[Dict[str, int]] class VideosKwargs(TypedDict, total=False): From bb1f18bb3bb9230dd3acfa5349328daf749c5fbe Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 24 Jul 2024 01:25:14 +0000 Subject: [PATCH 11/21] make style --- .../models/grounding_dino/processing_grounding_dino.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 9928eacbeb5ce0..a09d21502b3c07 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -77,8 +77,7 @@ class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): "return_length": False, "verbose": True, }, - "images_kwargs": { - }, + "images_kwargs": {}, } From a476c6ee88318ce40d73ea31e2dc2d4faa8ae410 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 24 Jul 2024 01:47:47 +0000 Subject: [PATCH 12/21] add molbap's commit --- .../processing_grounding_dino.py | 23 +- src/transformers/processing_utils.py | 71 +++---- tests/test_processing_common.py | 200 ++++++++++++++++-- 3 files changed, 225 insertions(+), 69 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index a09d21502b3c07..167b5598bfa7b9 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -16,13 +16,14 @@ Processor class for Grounding DINO. 
""" +import pathlib import sys -from typing import List, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union from ...image_processing_utils import BatchFeature from ...image_transforms import center_to_corners_format from ...image_utils import ImageInput -from ...processing_utils import ProcessingKwargs, ProcessorMixin +from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin if sys.version_info >= (3, 11): @@ -31,12 +32,19 @@ from typing_extensions import Unpack from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput -from ...utils import TensorType, is_torch_available +from ...utils import ExplicitEnum, TensorType, is_torch_available if is_torch_available(): import torch +AnnotationType = Dict[str, Union[int, str, List[Dict]]] + + +class AnnotationFormat(ExplicitEnum): + COCO_DETECTION = "coco_detection" + COCO_PANOPTIC = "coco_panoptic" + def get_phrases_from_posmap(posmaps, input_ids): """Get token ids of phrases from posmaps and input_ids. @@ -64,7 +72,16 @@ def get_phrases_from_posmap(posmaps, input_ids): return token_ids +class GroundingDinoImagesKwargs(ImagesKwargs, total=False): + annotations: Optional[Union[AnnotationType, List[AnnotationType]]] + return_segmentation_masks: Optional[bool] + masks_path: Optional[Union[str, pathlib.Path]] + do_convert_annotations: Optional[bool] + format: Optional[Union[str, AnnotationFormat]] + + class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: GroundingDinoImagesKwargs _defaults = { "text_kwargs": { "add_special_tokens": True, diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 7062a7699a79f7..d9f1e6f5efabde 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -150,6 +150,8 @@ class methods and docstrings. Standard deviation to use if normalizing the image. do_pad (`bool`, *optional*): Whether to pad the image to the `(max_height, max_width)` of the images in the batch. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. do_center_crop (`bool`, *optional*): Whether to center crop the image. data_format (`ChannelDimension` or `str`, *optional*): @@ -169,6 +171,7 @@ class methods and docstrings. image_mean: Optional[Union[float, List[float]]] image_std: Optional[Union[float, List[float]]] do_pad: Optional[bool] + pad_size: Optional[Dict[str, int]] do_center_crop: Optional[bool] data_format: Optional[ChannelDimension] input_data_format: Optional[Union[str, ChannelDimension]] @@ -320,7 +323,6 @@ class ProcessorMixin(PushToHubMixin): feature_extractor_class = None tokenizer_class = None _auto_class = None - valid_kwargs: List[str] = [] # args have to match the attributes class attribute def __init__(self, *args, **kwargs): @@ -649,15 +651,14 @@ def from_args_and_dict(cls, args, processor_dict: Dict[str, Any], **kwargs): processor_dict = processor_dict.copy() return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) - # We have to pop up some unused (but specific) kwargs and then validate that it doesn't contain unused kwargs - # If we don't pop, some specific kwargs will raise a warning + # Unlike image processors or feature extractors whose `__init__` accept `kwargs`, processor don't have `kwargs`. + # We have to pop up some unused (but specific) arguments to make it work. 
if "processor_class" in processor_dict: del processor_dict["processor_class"] if "auto_map" in processor_dict: del processor_dict["auto_map"] - unused_kwargs = cls.validate_init_kwargs(processor_config=processor_dict, valid_kwargs=cls.valid_kwargs) processor = cls(*args, **processor_dict) # Update processor with kwargs if needed @@ -665,7 +666,6 @@ def from_args_and_dict(cls, args, processor_dict: Dict[str, Any], **kwargs): if hasattr(processor, key): setattr(processor, key, kwargs.pop(key)) - kwargs.update(unused_kwargs) logger.info(f"Processor {processor}") if return_unused_kwargs: return processor, kwargs @@ -743,38 +743,34 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg if modality_key in tokenizer_init_kwargs: default_kwargs[modality][modality_key] = tokenizer_init_kwargs[modality_key] # now defaults kwargs are updated with the tokenizers defaults. - # pass defaults to output dictionary output_kwargs.update(default_kwargs) + # gather common kwargs and remove them from individual kwargs if present + common_kwargs = { + key: value + for key, value in kwargs.items() + if key not in ModelProcessorKwargs.__annotations__["text_kwargs"].__annotations__ + and key not in ModelProcessorKwargs.__annotations__["images_kwargs"].__annotations__ + and key not in ModelProcessorKwargs.__annotations__["audio_kwargs"].__annotations__ + and key not in ModelProcessorKwargs.__annotations__["videos_kwargs"].__annotations__ + } + + # ensure common kwargs are propagated to all relevant modalities + for key, value in common_kwargs.items(): + for modality in output_kwargs: + if modality != "common_kwargs": + output_kwargs[modality][key] = value + + # remove common kwargs from the kwargs to process the rest + kwargs = {k: v for k, v in kwargs.items() if k not in common_kwargs} + # update modality kwargs with passed kwargs - non_modality_kwargs = set(kwargs) - set(output_kwargs) for modality in output_kwargs: for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__.keys(): - # check if we received a structured kwarg dict or not to handle it correctly - if modality in kwargs: - kwarg_value = kwargs[modality].pop(modality_key, "__empty__") - # check if this key was passed as a flat kwarg. - if kwarg_value != "__empty__" and modality_key in non_modality_kwargs: - raise ValueError( - f"Keyword argument {modality_key} was passed two times: in a dictionary for {modality} and as a **kwarg." 
- ) + if modality in kwargs and modality_key in kwargs[modality]: + output_kwargs[modality][modality_key] = kwargs[modality][modality_key] elif modality_key in kwargs: - kwarg_value = kwargs.pop(modality_key, "__empty__") - else: - kwarg_value = "__empty__" - if kwarg_value != "__empty__": - output_kwargs[modality][modality_key] = kwarg_value - # if something remains in kwargs, it belongs to common after flattening - if set(kwargs) & set(default_kwargs): - # here kwargs is dictionary-based since it shares keys with default set - [output_kwargs["common_kwargs"].update(subdict) for _, subdict in kwargs.items()] - else: - # here it's a flat dict - output_kwargs["common_kwargs"].update(kwargs) - - # all modality-specific kwargs are updated with common kwargs - for modality in output_kwargs: - output_kwargs[modality].update(output_kwargs["common_kwargs"]) + output_kwargs[modality][modality_key] = kwargs[modality_key] return output_kwargs @classmethod @@ -890,19 +886,6 @@ def model_input_names(self): first_attribute = getattr(self, self.attributes[0]) return getattr(first_attribute, "model_input_names", None) - @staticmethod - def validate_init_kwargs(processor_config, valid_kwargs): - kwargs_from_config = processor_config.keys() - unused_kwargs = {} - unused_keys = set(kwargs_from_config) - set(valid_kwargs) - if unused_keys: - unused_key_str = ", ".join(unused_keys) - logger.warning( - f"Some kwargs in processor config are unused and will not have any effect: {unused_key_str}. " - ) - unused_kwargs = {k: processor_config[k] for k in unused_keys} - return unused_kwargs - def apply_chat_template( self, conversation: Union[List[Dict[str, str]]], diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index 074aa2f1d62545..e6128cde9bb503 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -16,6 +16,7 @@ import inspect import json +import random import tempfile @@ -38,15 +39,31 @@ from transformers.utils import is_vision_available +global_rng = random.Random() + if is_vision_available(): from PIL import Image from transformers import CLIPImageProcessor +# Copied from tests.models.whisper.test_feature_extraction_whisper.floats_list +def floats_list(shape, scale=1.0, rng=None, name=None): + """Creates a random float32 tensor""" + if rng is None: + rng = global_rng + + values = [] + for batch_idx in range(shape[0]): + values.append([]) + for _ in range(shape[1]): + values[-1].append(rng.random() * scale) + + return values + + @require_torch @require_vision -@require_torch class ProcessorTesterMixin: processor_class = None @@ -60,7 +77,10 @@ def get_component(self, attribute, **kwargs): component_class_name = component_class_name[0] component_class = processor_class_from_name(component_class_name) - component = component_class.from_pretrained(self.tmpdirname, **kwargs) # noqa + if hasattr(self, "tmpdirname"): + component = component_class.from_pretrained(self.tmpdirname, **kwargs) # noqa + elif hasattr(self, "model_id"): + component = component_class.from_pretrained(self.model_id, **kwargs) # noqa return component @@ -126,13 +146,13 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer", max_length=117) - + tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") + if not 
tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input, return_tensors="pt") self.assertEqual(len(inputs["input_ids"][0]), 117) @@ -141,15 +161,15 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): def test_image_processor_defaults_preserved_by_image_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", crop_size=(234, 234)) + image_processor = self.get_component("image_processor", crop_size=(234, 234), size=(234, 234)) tokenizer = self.get_component("tokenizer", max_length=117) - + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input) self.assertEqual(len(inputs["pixel_values"][0][0]), 234) @@ -160,13 +180,15 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer", max_length=117) - + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=112) + inputs = processor( + text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length" + ) self.assertEqual(len(inputs["input_ids"][0]), 112) @require_torch @@ -174,16 +196,17 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): def test_kwargs_overrides_default_image_processor_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", crop_size=(234, 234)) + image_processor = self.get_component("image_processor", crop_size=(234, 234), size=(234, 234)) tokenizer = self.get_component("tokenizer", max_length=117) - + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input, crop_size=[224, 224]) + inputs = processor(text=input_str, images=image_input, crop_size=[224, 224], size=[224, 224]) self.assertEqual(len(inputs["pixel_values"][0][0]), 224) @require_torch @@ -193,7 +216,8 @@ def test_unstructured_kwargs(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") - + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" processor = self.processor_class(tokenizer=tokenizer, 
image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) @@ -204,6 +228,7 @@ def test_unstructured_kwargs(self): images=image_input, return_tensors="pt", crop_size={"height": 214, "width": 214}, + size={"height": 214, "width": 214}, padding="max_length", max_length=76, ) @@ -218,7 +243,8 @@ def test_unstructured_kwargs_batched(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") - + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) @@ -229,10 +255,10 @@ def test_unstructured_kwargs_batched(self): images=image_input, return_tensors="pt", crop_size={"height": 214, "width": 214}, + size={"height": 214, "width": 214}, padding="longest", max_length=76, ) - self.assertEqual(inputs["pixel_values"].shape[2], 214) self.assertEqual(len(inputs["input_ids"][0]), 6) @@ -244,7 +270,8 @@ def test_doubly_passed_kwargs(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") - + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) @@ -265,7 +292,8 @@ def test_structured_kwargs_nested(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") - + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) @@ -275,7 +303,7 @@ def test_structured_kwargs_nested(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"crop_size": {"height": 214, "width": 214}}, + "images_kwargs": {"crop_size": {"height": 214, "width": 214}, "size": {"height": 214, "width": 214}}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } @@ -294,7 +322,8 @@ def test_structured_kwargs_nested_from_dict(self): image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") - + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" @@ -303,7 +332,7 @@ def test_structured_kwargs_nested_from_dict(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"crop_size": {"height": 214, "width": 214}}, + "images_kwargs": {"crop_size": {"height": 214, "width": 214}, "size": {"height": 214, "width": 214}}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } @@ -312,6 +341,133 @@ def test_structured_kwargs_nested_from_dict(self): self.assertEqual(len(inputs["input_ids"][0]), 76) + # text + audio kwargs testing + @require_torch + def test_tokenizer_defaults_preserved_by_kwargs_audio(self): + if "feature_extractor" not in self.processor_class.attributes: + self.skipTest(f"feature_extractor attribute not present in 
{self.processor_class}") + feature_extractor = self.get_component("feature_extractor") + if hasattr(self, "get_tokenizer"): + tokenizer = self.get_tokenizer(max_length=117, padding="max_length") + elif hasattr(self, "get_component"): + tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + raw_speech = floats_list((3, 1000)) + inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt") + if "input_ids" in inputs: + self.assertEqual(len(inputs["input_ids"][0]), 117) + elif "labels" in inputs: + self.assertEqual(len(inputs["labels"][0]), 117) + + @require_torch + def test_kwargs_overrides_default_tokenizer_kwargs_audio(self): + if "feature_extractor" not in self.processor_class.attributes: + self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") + feature_extractor = self.get_component("feature_extractor") + if hasattr(self, "get_tokenizer"): + tokenizer = self.get_tokenizer(max_length=117) + elif hasattr(self, "get_component"): + tokenizer = self.get_component("tokenizer", max_length=117) + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + raw_speech = floats_list((3, 1000)) + inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt", max_length=112, padding="max_length") + if "input_ids" in inputs: + self.assertEqual(len(inputs["input_ids"][0]), 112) + elif "labels" in inputs: + self.assertEqual(len(inputs["labels"][0]), 112) + + @require_torch + def test_unstructured_kwargs_audio(self): + if "feature_extractor" not in self.processor_class.attributes: + self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") + feature_extractor = self.get_component("feature_extractor") + if hasattr(self, "get_tokenizer"): + tokenizer = self.get_tokenizer(max_length=117) + elif hasattr(self, "get_component"): + tokenizer = self.get_component("tokenizer", max_length=117) + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + raw_speech = floats_list((3, 1000)) + inputs = processor( + text=input_str, + audio=raw_speech, + return_tensors="pt", + padding="max_length", + max_length=76, + ) + + if "input_ids" in inputs: + self.assertEqual(len(inputs["input_ids"][0]), 76) + elif "labels" in inputs: + self.assertEqual(len(inputs["labels"][0]), 76) + + @require_torch + def test_doubly_passed_kwargs_audio(self): + if "feature_extractor" not in self.processor_class.attributes: + self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") + feature_extractor = self.get_component("feature_extractor") + if hasattr(self, "get_tokenizer"): + tokenizer = self.get_tokenizer() + elif hasattr(self, "get_component"): + tokenizer = self.get_component("tokenizer") + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) + self.skip_processor_without_typed_kwargs(processor) + + 
input_str = ["lower newer"] + raw_speech = floats_list((3, 1000)) + with self.assertRaises(ValueError): + _ = processor( + text=input_str, + audio=raw_speech, + audio_kwargs={"padding": "max_length"}, + padding="max_length", + ) + + @require_torch + @require_vision + def test_structured_kwargs_audio_nested(self): + if "feature_extractor" not in self.processor_class.attributes: + self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") + feature_extractor = self.get_component("feature_extractor") + if hasattr(self, "get_tokenizer"): + tokenizer = self.get_tokenizer() + elif hasattr(self, "get_component"): + tokenizer = self.get_component("tokenizer") + if not tokenizer.pad_token: + tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = ["lower newer"] + raw_speech = floats_list((3, 1000)) + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "audio_kwargs": {"padding": "max_length", "max_length": 66}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, audio=raw_speech, **all_kwargs) + if "input_ids" in inputs: + self.assertEqual(len(inputs["input_ids"][0]), 76) + elif "labels" in inputs: + self.assertEqual(len(inputs["labels"][0]), 76) + class MyProcessor(ProcessorMixin): attributes = ["image_processor", "tokenizer"] From 81045219a22ed34a73cac3fe924908e57a9ebe53 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 24 Jul 2024 02:40:01 +0000 Subject: [PATCH 13/21] fix typo --- .../models/grounding_dino/modeling_grounding_dino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index dcdccc50cc116d..c33718bde54410 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -1580,7 +1580,7 @@ def _set_gradient_checkpointing(self, module, value=False): Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it. - Indices can be obtained using [`AutoTokenizer`]. See [`GroundingDinoTokenizer.__call__`] for details. + Indices can be obtained using [`AutoTokenizer`]. See [`BertTokenizer.__call__`] for details. token_type_ids (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`, *optional*): Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, From 5d6a088566b743d64defbd6bc55cdf00cbbe985e Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 24 Jul 2024 12:28:55 +0000 Subject: [PATCH 14/21] fix common processor --- src/transformers/processing_utils.py | 35 +++++++++++-------- .../test_processor_grounding_dino.py | 10 +++++- tests/test_processing_common.py | 14 ++++---- 3 files changed, 36 insertions(+), 23 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 8e0ab968d8589a..83ad01714dbceb 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -736,12 +736,12 @@ def _merge_kwargs( The order of operations is as follows: 1) kwargs passed as before have highest priority to preserve BC. 
```python - high_priority_kwargs = {"crop_size" = (224, 224), "padding" = "max_length"} + high_priority_kwargs = {"crop_size" = {"height": 222, "width": 222}, "padding" = "max_length"} processor(..., **high_priority_kwargs) ``` 2) kwargs passed as modality-specific kwargs have second priority. This is the recommended API. ```python - processor(..., text_kwargs={"padding": "max_length"}, images_kwargs={"crop_size": (224, 224)}}) + processor(..., text_kwargs={"padding": "max_length"}, images_kwargs={"crop_size": {"height": 222, "width": 222}}}) ``` 3) kwargs passed during instantiation of a modality processor have fourth priority. ```python @@ -799,14 +799,20 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg output_kwargs.update(default_kwargs) # gather common kwargs and remove them from individual kwargs if present - common_kwargs = { - key: value - for key, value in kwargs.items() - if key not in ModelProcessorKwargs.__annotations__["text_kwargs"].__annotations__ - and key not in ModelProcessorKwargs.__annotations__["images_kwargs"].__annotations__ - and key not in ModelProcessorKwargs.__annotations__["audio_kwargs"].__annotations__ - and key not in ModelProcessorKwargs.__annotations__["videos_kwargs"].__annotations__ - } + common_kwargs = {} + for key, value in kwargs.items(): + if key == "common_kwargs": + for common_key, common_value in value.items(): + common_kwargs[common_key] = common_value + elif key in ["text_kwargs", "images_kwargs", "audio_kwargs", "videos_kwargs"]: + pass + elif ( + key not in ModelProcessorKwargs.__annotations__["text_kwargs"].__annotations__ + and key not in ModelProcessorKwargs.__annotations__["images_kwargs"].__annotations__ + and key not in ModelProcessorKwargs.__annotations__["audio_kwargs"].__annotations__ + and key not in ModelProcessorKwargs.__annotations__["videos_kwargs"].__annotations__ + ): + common_kwargs[key] = value # ensure common kwargs are propagated to all relevant modalities for key, value in common_kwargs.items(): @@ -820,10 +826,10 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg # update modality kwargs with passed kwargs for modality in output_kwargs: for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__.keys(): - if modality in kwargs and modality_key in kwargs[modality]: - output_kwargs[modality][modality_key] = kwargs[modality][modality_key] - elif modality_key in kwargs: + if modality_key in kwargs: output_kwargs[modality][modality_key] = kwargs[modality_key] + elif modality in kwargs and modality_key in kwargs[modality]: + output_kwargs[modality][modality_key] = kwargs[modality][modality_key] return output_kwargs @classmethod @@ -988,5 +994,4 @@ def apply_chat_template( ProcessorMixin.push_to_hub = copy_func(ProcessorMixin.push_to_hub) if ProcessorMixin.push_to_hub.__doc__ is not None: ProcessorMixin.push_to_hub.__doc__ = ProcessorMixin.push_to_hub.__doc__.format( - object="processor", object_class="AutoProcessor", object_files="processor files" - ) + object="processor", object_class="AutoProcessor", object_ \ No newline at end of file diff --git a/tests/models/grounding_dino/test_processor_grounding_dino.py b/tests/models/grounding_dino/test_processor_grounding_dino.py index b7a259f0c31526..448aa8f7fb6433 100644 --- a/tests/models/grounding_dino/test_processor_grounding_dino.py +++ b/tests/models/grounding_dino/test_processor_grounding_dino.py @@ -43,6 +43,7 @@ @require_torch @require_vision class 
GroundingDinoProcessorTest(ProcessorTesterMixin, unittest.TestCase): + from_pretrained_id = "IDEA-Research/grounding-dino-base" processor_class = GroundingDinoProcessor def setUp(self): @@ -67,6 +68,13 @@ def setUp(self): with open(self.image_processor_file, "w", encoding="utf-8") as fp: json.dump(image_processor_map, fp) + image_processor = GroundingDinoImageProcessor() + tokenizer = BertTokenizer.from_pretrained(self.from_pretrained_id) + + processor = GroundingDinoProcessor(image_processor, tokenizer) + + processor.save_pretrained(self.tmpdirname) + self.batch_size = 7 self.num_queries = 5 self.embed_dim = 5 @@ -281,4 +289,4 @@ def test_unstructured_kwargs_batched(self): ) self.assertEqual(inputs["pixel_values"].shape[2], 214) - self.assertEqual(len(inputs["input_ids"][0]), 11) + self.assertEqual(len(inputs["input_ids"][0]), 6) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index e6128cde9bb503..b43d48e530b8ce 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -277,13 +277,13 @@ def test_doubly_passed_kwargs(self): input_str = ["lower newer"] image_input = self.prepare_image_inputs() - with self.assertRaises(ValueError): - _ = processor( - text=input_str, - images=image_input, - images_kwargs={"crop_size": {"height": 222, "width": 222}}, - crop_size={"height": 214, "width": 214}, - ) + inputs = processor( + text=input_str, + images=image_input, + images_kwargs={"size": {"height": 222, "width": 222}}, + size={"height": 35, "width": 35}, + ) + self.assertEqual(inputs["pixel_values"][0].shape[2], 35) @require_torch @require_vision From d5b13d2beb4cec5f617bdf6accf40241983c43cb Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 24 Jul 2024 12:29:47 +0000 Subject: [PATCH 15/21] remain --- src/transformers/processing_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 83ad01714dbceb..372e54e9e2776a 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -994,4 +994,5 @@ def apply_chat_template( ProcessorMixin.push_to_hub = copy_func(ProcessorMixin.push_to_hub) if ProcessorMixin.push_to_hub.__doc__ is not None: ProcessorMixin.push_to_hub.__doc__ = ProcessorMixin.push_to_hub.__doc__.format( - object="processor", object_class="AutoProcessor", object_ \ No newline at end of file + object="processor", object_class="AutoProcessor", object_files="processor files" + ) From 1cf9139ef8dad3c48db6f47f6f5c8ed6e351d79d Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 29 Jul 2024 12:27:59 +0000 Subject: [PATCH 16/21] Revert "add molbap's commit" This reverts commit a476c6ee88318ce40d73ea31e2dc2d4faa8ae410. --- .../processing_grounding_dino.py | 23 +- src/transformers/processing_utils.py | 79 ++++--- tests/test_processing_common.py | 200 ++---------------- 3 files changed, 70 insertions(+), 232 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 167b5598bfa7b9..a09d21502b3c07 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -16,14 +16,13 @@ Processor class for Grounding DINO. 
""" -import pathlib import sys -from typing import Dict, List, Optional, Tuple, Union +from typing import List, Tuple, Union from ...image_processing_utils import BatchFeature from ...image_transforms import center_to_corners_format from ...image_utils import ImageInput -from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin +from ...processing_utils import ProcessingKwargs, ProcessorMixin if sys.version_info >= (3, 11): @@ -32,19 +31,12 @@ from typing_extensions import Unpack from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput -from ...utils import ExplicitEnum, TensorType, is_torch_available +from ...utils import TensorType, is_torch_available if is_torch_available(): import torch -AnnotationType = Dict[str, Union[int, str, List[Dict]]] - - -class AnnotationFormat(ExplicitEnum): - COCO_DETECTION = "coco_detection" - COCO_PANOPTIC = "coco_panoptic" - def get_phrases_from_posmap(posmaps, input_ids): """Get token ids of phrases from posmaps and input_ids. @@ -72,16 +64,7 @@ def get_phrases_from_posmap(posmaps, input_ids): return token_ids -class GroundingDinoImagesKwargs(ImagesKwargs, total=False): - annotations: Optional[Union[AnnotationType, List[AnnotationType]]] - return_segmentation_masks: Optional[bool] - masks_path: Optional[Union[str, pathlib.Path]] - do_convert_annotations: Optional[bool] - format: Optional[Union[str, AnnotationFormat]] - - class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: GroundingDinoImagesKwargs _defaults = { "text_kwargs": { "add_special_tokens": True, diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 372e54e9e2776a..9abb4b29fcd7ff 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -151,8 +151,6 @@ class methods and docstrings. Standard deviation to use if normalizing the image. do_pad (`bool`, *optional*): Whether to pad the image to the `(max_height, max_width)` of the images in the batch. - pad_size (`Dict[str, int]`, *optional*): - The size `{"height": int, "width" int}` to pad the images to. do_center_crop (`bool`, *optional*): Whether to center crop the image. data_format (`ChannelDimension` or `str`, *optional*): @@ -172,7 +170,6 @@ class methods and docstrings. image_mean: Optional[Union[float, List[float]]] image_std: Optional[Union[float, List[float]]] do_pad: Optional[bool] - pad_size: Optional[Dict[str, int]] do_center_crop: Optional[bool] data_format: Optional[ChannelDimension] input_data_format: Optional[Union[str, ChannelDimension]] @@ -324,6 +321,7 @@ class ProcessorMixin(PushToHubMixin): feature_extractor_class = None tokenizer_class = None _auto_class = None + valid_kwargs: List[str] = [] # args have to match the attributes class attribute def __init__(self, *args, **kwargs): @@ -702,14 +700,15 @@ def from_args_and_dict(cls, args, processor_dict: Dict[str, Any], **kwargs): return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) chat_template = kwargs.pop("chat_template", None) - # Unlike image processors or feature extractors whose `__init__` accept `kwargs`, processor don't have `kwargs`. - # We have to pop up some unused (but specific) arguments to make it work. 
+ # We have to pop up some unused (but specific) kwargs and then validate that it doesn't contain unused kwargs + # If we don't pop, some specific kwargs will raise a warning if "processor_class" in processor_dict: del processor_dict["processor_class"] if "auto_map" in processor_dict: del processor_dict["auto_map"] + unused_kwargs = cls.validate_init_kwargs(processor_config=processor_dict, valid_kwargs=cls.valid_kwargs) processor = cls(*args, **processor_dict) if chat_template is not None: setattr(processor, "chat_template", chat_template) @@ -719,6 +718,7 @@ def from_args_and_dict(cls, args, processor_dict: Dict[str, Any], **kwargs): if hasattr(processor, key): setattr(processor, key, kwargs.pop(key)) + kwargs.update(unused_kwargs) logger.info(f"Processor {processor}") if return_unused_kwargs: return processor, kwargs @@ -796,40 +796,38 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg if modality_key in tokenizer_init_kwargs: default_kwargs[modality][modality_key] = tokenizer_init_kwargs[modality_key] # now defaults kwargs are updated with the tokenizers defaults. + # pass defaults to output dictionary output_kwargs.update(default_kwargs) - # gather common kwargs and remove them from individual kwargs if present - common_kwargs = {} - for key, value in kwargs.items(): - if key == "common_kwargs": - for common_key, common_value in value.items(): - common_kwargs[common_key] = common_value - elif key in ["text_kwargs", "images_kwargs", "audio_kwargs", "videos_kwargs"]: - pass - elif ( - key not in ModelProcessorKwargs.__annotations__["text_kwargs"].__annotations__ - and key not in ModelProcessorKwargs.__annotations__["images_kwargs"].__annotations__ - and key not in ModelProcessorKwargs.__annotations__["audio_kwargs"].__annotations__ - and key not in ModelProcessorKwargs.__annotations__["videos_kwargs"].__annotations__ - ): - common_kwargs[key] = value - - # ensure common kwargs are propagated to all relevant modalities - for key, value in common_kwargs.items(): - for modality in output_kwargs: - if modality != "common_kwargs": - output_kwargs[modality][key] = value - - # remove common kwargs from the kwargs to process the rest - kwargs = {k: v for k, v in kwargs.items() if k not in common_kwargs} - # update modality kwargs with passed kwargs + non_modality_kwargs = set(kwargs) - set(output_kwargs) for modality in output_kwargs: for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__.keys(): - if modality_key in kwargs: - output_kwargs[modality][modality_key] = kwargs[modality_key] - elif modality in kwargs and modality_key in kwargs[modality]: - output_kwargs[modality][modality_key] = kwargs[modality][modality_key] + # check if we received a structured kwarg dict or not to handle it correctly + if modality in kwargs: + kwarg_value = kwargs[modality].pop(modality_key, "__empty__") + # check if this key was passed as a flat kwarg. + if kwarg_value != "__empty__" and modality_key in non_modality_kwargs: + raise ValueError( + f"Keyword argument {modality_key} was passed two times: in a dictionary for {modality} and as a **kwarg." 
+ ) + elif modality_key in kwargs: + kwarg_value = kwargs.pop(modality_key, "__empty__") + else: + kwarg_value = "__empty__" + if kwarg_value != "__empty__": + output_kwargs[modality][modality_key] = kwarg_value + # if something remains in kwargs, it belongs to common after flattening + if set(kwargs) & set(default_kwargs): + # here kwargs is dictionary-based since it shares keys with default set + [output_kwargs["common_kwargs"].update(subdict) for _, subdict in kwargs.items()] + else: + # here it's a flat dict + output_kwargs["common_kwargs"].update(kwargs) + + # all modality-specific kwargs are updated with common kwargs + for modality in output_kwargs: + output_kwargs[modality].update(output_kwargs["common_kwargs"]) return output_kwargs @classmethod @@ -945,6 +943,19 @@ def model_input_names(self): first_attribute = getattr(self, self.attributes[0]) return getattr(first_attribute, "model_input_names", None) + @staticmethod + def validate_init_kwargs(processor_config, valid_kwargs): + kwargs_from_config = processor_config.keys() + unused_kwargs = {} + unused_keys = set(kwargs_from_config) - set(valid_kwargs) + if unused_keys: + unused_key_str = ", ".join(unused_keys) + logger.warning( + f"Some kwargs in processor config are unused and will not have any effect: {unused_key_str}. " + ) + unused_kwargs = {k: processor_config[k] for k in unused_keys} + return unused_kwargs + def apply_chat_template( self, conversation: Union[List[Dict[str, str]]], diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index b43d48e530b8ce..bb4d86d3f5a500 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -16,7 +16,6 @@ import inspect import json -import random import tempfile @@ -39,31 +38,15 @@ from transformers.utils import is_vision_available -global_rng = random.Random() - if is_vision_available(): from PIL import Image from transformers import CLIPImageProcessor -# Copied from tests.models.whisper.test_feature_extraction_whisper.floats_list -def floats_list(shape, scale=1.0, rng=None, name=None): - """Creates a random float32 tensor""" - if rng is None: - rng = global_rng - - values = [] - for batch_idx in range(shape[0]): - values.append([]) - for _ in range(shape[1]): - values[-1].append(rng.random() * scale) - - return values - - @require_torch @require_vision +@require_torch class ProcessorTesterMixin: processor_class = None @@ -77,10 +60,7 @@ def get_component(self, attribute, **kwargs): component_class_name = component_class_name[0] component_class = processor_class_from_name(component_class_name) - if hasattr(self, "tmpdirname"): - component = component_class.from_pretrained(self.tmpdirname, **kwargs) # noqa - elif hasattr(self, "model_id"): - component = component_class.from_pretrained(self.model_id, **kwargs) # noqa + component = component_class.from_pretrained(self.tmpdirname, **kwargs) # noqa return component @@ -146,13 +126,13 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" + tokenizer = self.get_component("tokenizer", max_length=117) + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) 
input_str = "lower newer" image_input = self.prepare_image_inputs() + inputs = processor(text=input_str, images=image_input, return_tensors="pt") self.assertEqual(len(inputs["input_ids"][0]), 117) @@ -161,15 +141,15 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): def test_image_processor_defaults_preserved_by_image_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", crop_size=(234, 234), size=(234, 234)) + image_processor = self.get_component("image_processor", crop_size=(234, 234)) tokenizer = self.get_component("tokenizer", max_length=117) - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() + inputs = processor(text=input_str, images=image_input) self.assertEqual(len(inputs["pixel_values"][0][0]), 234) @@ -180,15 +160,13 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer", max_length=117) - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor( - text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length" - ) + + inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=112) self.assertEqual(len(inputs["input_ids"][0]), 112) @require_torch @@ -196,17 +174,16 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): def test_kwargs_overrides_default_image_processor_kwargs(self): if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor", crop_size=(234, 234), size=(234, 234)) + image_processor = self.get_component("image_processor", crop_size=(234, 234)) tokenizer = self.get_component("tokenizer", max_length=117) - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input, crop_size=[224, 224], size=[224, 224]) + inputs = processor(text=input_str, images=image_input, crop_size=[224, 224]) self.assertEqual(len(inputs["pixel_values"][0][0]), 224) @require_torch @@ -216,8 +193,7 @@ def test_unstructured_kwargs(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) @@ -228,7 +204,6 @@ def test_unstructured_kwargs(self): images=image_input, return_tensors="pt", 
crop_size={"height": 214, "width": 214}, - size={"height": 214, "width": 214}, padding="max_length", max_length=76, ) @@ -243,8 +218,7 @@ def test_unstructured_kwargs_batched(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) @@ -255,10 +229,10 @@ def test_unstructured_kwargs_batched(self): images=image_input, return_tensors="pt", crop_size={"height": 214, "width": 214}, - size={"height": 214, "width": 214}, padding="longest", max_length=76, ) + self.assertEqual(inputs["pixel_values"].shape[2], 214) self.assertEqual(len(inputs["input_ids"][0]), 6) @@ -270,8 +244,7 @@ def test_doubly_passed_kwargs(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) @@ -292,8 +265,7 @@ def test_structured_kwargs_nested(self): self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) @@ -303,7 +275,7 @@ def test_structured_kwargs_nested(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"crop_size": {"height": 214, "width": 214}, "size": {"height": 214, "width": 214}}, + "images_kwargs": {"crop_size": {"height": 214, "width": 214}}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } @@ -322,8 +294,7 @@ def test_structured_kwargs_nested_from_dict(self): image_processor = self.get_component("image_processor") tokenizer = self.get_component("tokenizer") - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" @@ -332,7 +303,7 @@ def test_structured_kwargs_nested_from_dict(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"crop_size": {"height": 214, "width": 214}, "size": {"height": 214, "width": 214}}, + "images_kwargs": {"crop_size": {"height": 214, "width": 214}}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } @@ -341,133 +312,6 @@ def test_structured_kwargs_nested_from_dict(self): self.assertEqual(len(inputs["input_ids"][0]), 76) - # text + audio kwargs testing - @require_torch - def test_tokenizer_defaults_preserved_by_kwargs_audio(self): - if "feature_extractor" not in self.processor_class.attributes: - self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") - feature_extractor = self.get_component("feature_extractor") - if hasattr(self, "get_tokenizer"): - tokenizer = self.get_tokenizer(max_length=117, padding="max_length") - elif 
hasattr(self, "get_component"): - tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" - processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) - self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" - raw_speech = floats_list((3, 1000)) - inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt") - if "input_ids" in inputs: - self.assertEqual(len(inputs["input_ids"][0]), 117) - elif "labels" in inputs: - self.assertEqual(len(inputs["labels"][0]), 117) - - @require_torch - def test_kwargs_overrides_default_tokenizer_kwargs_audio(self): - if "feature_extractor" not in self.processor_class.attributes: - self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") - feature_extractor = self.get_component("feature_extractor") - if hasattr(self, "get_tokenizer"): - tokenizer = self.get_tokenizer(max_length=117) - elif hasattr(self, "get_component"): - tokenizer = self.get_component("tokenizer", max_length=117) - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" - processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) - self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" - raw_speech = floats_list((3, 1000)) - inputs = processor(text=input_str, audio=raw_speech, return_tensors="pt", max_length=112, padding="max_length") - if "input_ids" in inputs: - self.assertEqual(len(inputs["input_ids"][0]), 112) - elif "labels" in inputs: - self.assertEqual(len(inputs["labels"][0]), 112) - - @require_torch - def test_unstructured_kwargs_audio(self): - if "feature_extractor" not in self.processor_class.attributes: - self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") - feature_extractor = self.get_component("feature_extractor") - if hasattr(self, "get_tokenizer"): - tokenizer = self.get_tokenizer(max_length=117) - elif hasattr(self, "get_component"): - tokenizer = self.get_component("tokenizer", max_length=117) - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" - processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = "lower newer" - raw_speech = floats_list((3, 1000)) - inputs = processor( - text=input_str, - audio=raw_speech, - return_tensors="pt", - padding="max_length", - max_length=76, - ) - - if "input_ids" in inputs: - self.assertEqual(len(inputs["input_ids"][0]), 76) - elif "labels" in inputs: - self.assertEqual(len(inputs["labels"][0]), 76) - - @require_torch - def test_doubly_passed_kwargs_audio(self): - if "feature_extractor" not in self.processor_class.attributes: - self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") - feature_extractor = self.get_component("feature_extractor") - if hasattr(self, "get_tokenizer"): - tokenizer = self.get_tokenizer() - elif hasattr(self, "get_component"): - tokenizer = self.get_component("tokenizer") - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" - processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = ["lower newer"] - raw_speech = floats_list((3, 1000)) - with self.assertRaises(ValueError): - _ = processor( - text=input_str, - audio=raw_speech, - audio_kwargs={"padding": "max_length"}, - 
padding="max_length", - ) - - @require_torch - @require_vision - def test_structured_kwargs_audio_nested(self): - if "feature_extractor" not in self.processor_class.attributes: - self.skipTest(f"feature_extractor attribute not present in {self.processor_class}") - feature_extractor = self.get_component("feature_extractor") - if hasattr(self, "get_tokenizer"): - tokenizer = self.get_tokenizer() - elif hasattr(self, "get_component"): - tokenizer = self.get_component("tokenizer") - if not tokenizer.pad_token: - tokenizer.pad_token = "[TEST_PAD]" - processor = self.processor_class(tokenizer=tokenizer, feature_extractor=feature_extractor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = ["lower newer"] - raw_speech = floats_list((3, 1000)) - - # Define the kwargs for each modality - all_kwargs = { - "common_kwargs": {"return_tensors": "pt"}, - "audio_kwargs": {"padding": "max_length", "max_length": 66}, - "text_kwargs": {"padding": "max_length", "max_length": 76}, - } - - inputs = processor(text=input_str, audio=raw_speech, **all_kwargs) - if "input_ids" in inputs: - self.assertEqual(len(inputs["input_ids"][0]), 76) - elif "labels" in inputs: - self.assertEqual(len(inputs["labels"][0]), 76) - class MyProcessor(ProcessorMixin): attributes = ["image_processor", "tokenizer"] From 86722b43d1577e9e9f697e6186083851aa9001d6 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 29 Jul 2024 13:09:27 +0000 Subject: [PATCH 17/21] add unsync PR --- .../processing_grounding_dino.py | 27 +++++++++++++++---- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index a09d21502b3c07..444b29085b0142 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -16,13 +16,14 @@ Processor class for Grounding DINO. """ +import pathlib import sys -from typing import List, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union from ...image_processing_utils import BatchFeature from ...image_transforms import center_to_corners_format from ...image_utils import ImageInput -from ...processing_utils import ProcessingKwargs, ProcessorMixin +from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin if sys.version_info >= (3, 11): @@ -31,13 +32,21 @@ from typing_extensions import Unpack from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput -from ...utils import TensorType, is_torch_available +from ...utils import ExplicitEnum, TensorType, is_torch_available if is_torch_available(): import torch +AnnotationType = Dict[str, Union[int, str, List[Dict]]] + + +class AnnotationFormat(ExplicitEnum): + COCO_DETECTION = "coco_detection" + COCO_PANOPTIC = "coco_panoptic" + + def get_phrases_from_posmap(posmaps, input_ids): """Get token ids of phrases from posmaps and input_ids. 
@@ -64,7 +73,16 @@ def get_phrases_from_posmap(posmaps, input_ids): return token_ids +class GroundingDinoImagesKwargs(ImagesKwargs, total=False): + annotations: Optional[Union[AnnotationType, List[AnnotationType]]] + return_segmentation_masks: Optional[bool] + masks_path: Optional[Union[str, pathlib.Path]] + do_convert_annotations: Optional[bool] + format: Optional[Union[str, AnnotationFormat]] + + class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: GroundingDinoImagesKwargs _defaults = { "text_kwargs": { "add_special_tokens": True, @@ -76,8 +94,7 @@ class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): "return_token_type_ids": True, "return_length": False, "verbose": True, - }, - "images_kwargs": {}, + } } From 8baa8e080eb0724d831341f22d25fd5607ff4b76 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 29 Jul 2024 13:25:34 +0000 Subject: [PATCH 18/21] revert --- tests/test_processing_common.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index bb4d86d3f5a500..074aa2f1d62545 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -250,13 +250,13 @@ def test_doubly_passed_kwargs(self): input_str = ["lower newer"] image_input = self.prepare_image_inputs() - inputs = processor( - text=input_str, - images=image_input, - images_kwargs={"size": {"height": 222, "width": 222}}, - size={"height": 35, "width": 35}, - ) - self.assertEqual(inputs["pixel_values"][0].shape[2], 35) + with self.assertRaises(ValueError): + _ = processor( + text=input_str, + images=image_input, + images_kwargs={"crop_size": {"height": 222, "width": 222}}, + crop_size={"height": 214, "width": 214}, + ) @require_torch @require_vision From 39f28afc08d3d4a371112c830acdd5a73d71e87b Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 5 Aug 2024 06:54:07 +0000 Subject: [PATCH 19/21] make CI happy --- .../test_processor_grounding_dino.py | 157 ++++++++++++++++++ 1 file changed, 157 insertions(+) diff --git a/tests/models/grounding_dino/test_processor_grounding_dino.py b/tests/models/grounding_dino/test_processor_grounding_dino.py index 448aa8f7fb6433..32c61e407df027 100644 --- a/tests/models/grounding_dino/test_processor_grounding_dino.py +++ b/tests/models/grounding_dino/test_processor_grounding_dino.py @@ -22,6 +22,7 @@ import pytest from transformers import BertTokenizer, BertTokenizerFast, GroundingDinoProcessor +from transformers.models.auto.processing_auto import processor_class_from_name from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES from transformers.testing_utils import require_torch, require_vision from transformers.utils import IMAGE_PROCESSOR_NAME, is_torch_available, is_vision_available @@ -80,6 +81,17 @@ def setUp(self): self.embed_dim = 5 self.seq_length = 5 + def get_component(self, attribute, **kwargs): + assert attribute in self.processor_class.attributes + component_class_name = getattr(self.processor_class, f"{attribute}_class") + if isinstance(component_class_name, tuple): + component_class_name = component_class_name[0] + + component_class = processor_class_from_name(component_class_name) + component = component_class.from_pretrained(self.tmpdirname, **kwargs) # noqa + + return component + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.get_tokenizer with CLIP->Bert def get_tokenizer(self, **kwargs): return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs) @@ -264,6 +276,151 @@ 
def test_model_input_names(self): self.assertListEqual(list(inputs.keys()), processor.model_input_names) + @require_torch + @require_vision + def test_image_processor_defaults_preserved_by_image_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor", size={"height": 234, "width": 234}) + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + self.assertEqual(len(inputs["pixel_values"][0][0]), 234) + + @require_vision + @require_torch + def test_kwargs_overrides_default_tokenizer_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=112) + self.assertEqual(len(inputs["input_ids"][0]), 4) + + @require_vision + @require_torch + def test_tokenizer_defaults_preserved_by_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input, return_tensors="pt") + self.assertEqual(len(inputs["input_ids"][0]), 4) + + @require_torch + @require_vision + def test_kwargs_overrides_default_image_processor_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor", size=(234, 234)) + tokenizer = self.get_component("tokenizer", max_length=117) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input, size=[224, 224]) + self.assertEqual(len(inputs["pixel_values"][0][0]), 224) + + @require_torch + @require_vision + def test_structured_kwargs_nested(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = 
self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": {"size": {"height": 214, "width": 214}}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.skip_processor_without_typed_kwargs(processor) + + self.assertEqual(inputs["pixel_values"].shape[2], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + @require_vision + def test_structured_kwargs_nested_from_dict(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": {"size": {"height": 214, "width": 214}}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.assertEqual(inputs["pixel_values"].shape[2], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + @require_vision + def test_unstructured_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + size={"height": 214, "width": 214}, + padding="max_length", + max_length=76, + ) + + self.assertEqual(inputs["pixel_values"].shape[2], 214) + self.assertEqual(len(inputs["input_ids"][0]), 76) + @require_torch @require_vision def test_unstructured_kwargs_batched(self): From 7366aab07281f4792a5d85626efcf65b27b62a78 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Mon, 5 Aug 2024 14:12:19 +0000 Subject: [PATCH 20/21] nit --- .../test_processor_grounding_dino.py | 22 +++++-------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/tests/models/grounding_dino/test_processor_grounding_dino.py b/tests/models/grounding_dino/test_processor_grounding_dino.py index 32c61e407df027..c0bb186b392eb0 100644 --- a/tests/models/grounding_dino/test_processor_grounding_dino.py +++ b/tests/models/grounding_dino/test_processor_grounding_dino.py @@ -22,7 +22,6 @@ import pytest from transformers import BertTokenizer, BertTokenizerFast, GroundingDinoProcessor -from transformers.models.auto.processing_auto import processor_class_from_name from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES from transformers.testing_utils import require_torch, require_vision from transformers.utils import IMAGE_PROCESSOR_NAME, is_torch_available, is_vision_available @@ -81,17 +80,6 @@ def setUp(self): self.embed_dim = 5 self.seq_length = 5 - def get_component(self, attribute, **kwargs): - assert attribute in 
self.processor_class.attributes - component_class_name = getattr(self.processor_class, f"{attribute}_class") - if isinstance(component_class_name, tuple): - component_class_name = component_class_name[0] - - component_class = processor_class_from_name(component_class_name) - component = component_class.from_pretrained(self.tmpdirname, **kwargs) # noqa - - return component - # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.get_tokenizer with CLIP->Bert def get_tokenizer(self, **kwargs): return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs) @@ -306,8 +294,10 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=112) - self.assertEqual(len(inputs["input_ids"][0]), 4) + inputs = processor( + text=input_str, images=image_input, return_tensors="pt", padding="max_length", max_length=112 + ) + self.assertEqual(len(inputs["input_ids"][0]), 112) @require_vision @require_torch @@ -322,8 +312,8 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input, return_tensors="pt") - self.assertEqual(len(inputs["input_ids"][0]), 4) + inputs = processor(text=input_str, images=image_input, return_tensors="pt", padding="max_length") + self.assertEqual(len(inputs["input_ids"][0]), 117) @require_torch @require_vision From 64839abcb2ab0424f1b8f83a90f184a3b98ddfe8 Mon Sep 17 00:00:00 2001 From: sangbumchoi Date: Wed, 7 Aug 2024 22:44:18 +0000 Subject: [PATCH 21/21] import annotationformat --- .../models/grounding_dino/processing_grounding_dino.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 444b29085b0142..00c183338be056 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -22,7 +22,7 @@ from ...image_processing_utils import BatchFeature from ...image_transforms import center_to_corners_format -from ...image_utils import ImageInput +from ...image_utils import AnnotationFormat, ImageInput from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin @@ -32,7 +32,7 @@ from typing_extensions import Unpack from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput -from ...utils import ExplicitEnum, TensorType, is_torch_available +from ...utils import TensorType, is_torch_available if is_torch_available(): @@ -42,11 +42,6 @@ AnnotationType = Dict[str, Union[int, str, List[Dict]]] -class AnnotationFormat(ExplicitEnum): - COCO_DETECTION = "coco_detection" - COCO_PANOPTIC = "coco_panoptic" - - def get_phrases_from_posmap(posmaps, input_ids): """Get token ids of phrases from posmaps and input_ids.
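After patch 21, the series settles on the following public behaviour for
GroundingDinoProcessor: tokenizer and image-processor kwargs can be passed
either nested per modality or flat, _merge_kwargs routes each key to the right
component, and passing the same key both ways raises a ValueError. The snippet
below is an illustrative sketch, not part of any patch: it reuses the
"IDEA-Research/grounding-dino-base" checkpoint from the tests, substitutes a
random array for the test mixin's prepare_image_inputs(), and mirrors the
assertions of test_structured_kwargs_nested and test_unstructured_kwargs.

    import numpy as np
    from PIL import Image

    from transformers import BertTokenizer, GroundingDinoImageProcessor, GroundingDinoProcessor

    tokenizer = BertTokenizer.from_pretrained("IDEA-Research/grounding-dino-base")
    image_processor = GroundingDinoImageProcessor()
    processor = GroundingDinoProcessor(image_processor, tokenizer)

    # Stand-in for prepare_image_inputs() in the common tester.
    image = Image.fromarray(np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8))

    # Structured form: one dict per modality plus common kwargs.
    inputs = processor(
        text="lower newer",
        images=image,
        images_kwargs={"size": {"height": 214, "width": 214}},
        text_kwargs={"padding": "max_length", "max_length": 76},
        common_kwargs={"return_tensors": "pt"},
    )
    assert inputs["pixel_values"].shape[2] == 214  # resized via images_kwargs
    assert len(inputs["input_ids"][0]) == 76  # padded via text_kwargs

    # Flat form: each kwarg is routed to the matching modality.
    inputs = processor(
        text="lower newer",
        images=image,
        size={"height": 214, "width": 214},
        padding="max_length",
        max_length=76,
        return_tensors="pt",
    )
    assert inputs["pixel_values"].shape[2] == 214
    assert len(inputs["input_ids"][0]) == 76

    # Passing the same key both flat and nested, e.g. size=... together with
    # images_kwargs={"size": ...}, raises a ValueError, which is the behaviour
    # test_doubly_passed_kwargs asserts once patch 18 restores it.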