Skip to content

Commit

Permalink
change size and crop_size in processor kwargs tests to do_rescale and…
Browse files Browse the repository at this point in the history
… rescale_factor
  • Loading branch information
yonigozlan committed Sep 13, 2024
1 parent 86211d3 commit df6ddaf
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 360 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,11 @@ class LlavaOnevisionProcessor(ProcessorMixin):
r"""
Constructs a LLaVa-Onevision processor which wraps a LLaVa-Onevision video processor, LLaVa-NeXT image processor and a LLaMa tokenizer into a single processor.
[`LlavaNextProcessor`] offers all the functionalities of [`LlavaOnevisionVideoProcessor`], [`LlavaNextImageProcessor`] and [`LlamaTokenizerFast`]. See the
[`LlavaNextProcessor`] offers all the functionalities of [`LlavaOnevisionVideoProcessor`], [`LlavaOnevisionImageProcessor`] and [`LlamaTokenizerFast`]. See the
[`~LlavaOnevisionVideoProcessor.__call__`], [`~LlavaNextProcessor.__call__`] and [`~LlavaNextProcessor.decode`] for more information.
Args:
image_processor ([`LlavaNextImageProcessor`], *optional*):
image_processor ([`LlavaOnevisionImageProcessor`], *optional*):
The image processor is a required input.
tokenizer ([`LlamaTokenizerFast`], *optional*):
The tokenizer is a required input.
Expand All @@ -82,7 +82,7 @@ class LlavaOnevisionProcessor(ProcessorMixin):
"image_token",
"video_token",
]
image_processor_class = "AutoImageProcessor"
image_processor_class = "LlavaOnevisionImageProcessor"
tokenizer_class = "AutoTokenizer"
video_processor_class = "LlavaOnevisionVideoProcessor"

Expand All @@ -96,7 +96,6 @@ def __init__(
chat_template=None,
image_token="<image>",
video_token="<video>",
**kwargs: Unpack[LlavaOnevisionProcessorKwargs],
):
self.num_image_tokens = num_image_tokens
self.vision_feature_select_strategy = vision_feature_select_strategy
Expand All @@ -108,8 +107,9 @@ def __call__(
self,
images: ImageInput = None,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
audio=None,
videos: VideoInput = None,
**kwargs,
**kwargs: Unpack[LlavaOnevisionProcessorKwargs],
) -> BatchFeature:
"""
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
Expand Down
203 changes: 1 addition & 202 deletions tests/models/llava_onevision/test_processing_llava_onevision.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import tempfile
import unittest

from transformers.testing_utils import require_torch, require_vision
from transformers.testing_utils import require_vision
from transformers.utils import is_vision_available

from ...test_processing_common import ProcessorTesterMixin
Expand Down Expand Up @@ -74,204 +74,3 @@ def test_chat_template(self):

formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
self.assertEqual(expected_prompt, formatted_prompt)

@require_torch
@require_vision
def test_image_processor_defaults_preserved_by_image_kwargs(self):
    # Overridden: the llava-next image processor returns pixel values with an
    # added dimension for image patches, so the default `size` is checked on
    # the innermost axis instead.
    if "image_processor" not in self.processor_class.attributes:
        self.skipTest(f"image_processor attribute not present in {self.processor_class}")
    image_proc = self.get_component("image_processor", size=(234, 234))
    video_proc = self.get_component("video_processor", size=(234, 234))
    tok = self.get_component("tokenizer", max_length=117)

    processor = self.processor_class(
        tokenizer=tok, image_processor=image_proc, video_processor=video_proc
    )
    self.skip_processor_without_typed_kwargs(processor)

    prompt = "lower newer"
    images = self.prepare_image_inputs()

    outputs = processor(text=prompt, images=images)
    # index through the added patch dimension to reach the spatial axis
    self.assertEqual(len(outputs["pixel_values"][0][0][0]), 234)

@require_torch
@require_vision
def test_kwargs_overrides_default_image_processor_kwargs(self):
    if "image_processor" not in self.processor_class.attributes:
        self.skipTest(f"image_processor attribute not present in {self.processor_class}")
    image_proc = self.get_component("image_processor", crop_size=(234, 234))
    video_proc = self.get_component("video_processor", size=(234, 234))
    tok = self.get_component("tokenizer", max_length=117)

    processor = self.processor_class(
        tokenizer=tok, image_processor=image_proc, video_processor=video_proc
    )
    self.skip_processor_without_typed_kwargs(processor)

    prompt = "lower newer"
    images = self.prepare_image_inputs()

    # a call-time `size` must win over the value configured at init;
    # pixel_values carries an added dimension for image patches
    outputs = processor(text=prompt, images=images, size=[224, 224])
    self.assertEqual(len(outputs["pixel_values"][0][0][0]), 224)

@require_torch
@require_vision
def test_unstructured_kwargs(self):
    # Flat (unstructured) kwargs should be routed to the right sub-processor.
    processor = self.processor_class(
        image_processor=self.get_component("image_processor"),
        video_processor=self.get_component("video_processor"),
        tokenizer=self.get_component("tokenizer"),
    )
    self.skip_processor_without_typed_kwargs(processor)

    outputs = processor(
        text="lower newer",
        images=self.prepare_image_inputs(),
        return_tensors="pt",
        size={"height": 214, "width": 214},
        padding="max_length",
        max_length=76,
    )

    # the image size lands after the added patch dimension
    self.assertEqual(outputs["pixel_values"].shape[3], 214)
    self.assertEqual(len(outputs["input_ids"][0]), 76)

@require_torch
@require_vision
def test_unstructured_kwargs_batched(self):
    # Same as test_unstructured_kwargs but with a batch of two samples.
    processor = self.processor_class(
        image_processor=self.get_component("image_processor"),
        video_processor=self.get_component("video_processor"),
        tokenizer=self.get_component("tokenizer"),
    )
    self.skip_processor_without_typed_kwargs(processor)

    prompts = ["lower newer", "upper older longer string"]
    images = self.prepare_image_inputs() * 2
    outputs = processor(
        text=prompts,
        images=images,
        return_tensors="pt",
        size={"height": 214, "width": 214},
        padding="longest",
        max_length=76,
    )
    self.assertEqual(outputs["pixel_values"].shape[3], 214)
    # "longest" padding: both rows are padded to the longer sequence
    self.assertEqual(len(outputs["input_ids"][0]), 5)

@require_torch
@require_vision
def test_structured_kwargs_nested(self):
    # kwargs grouped per modality should be unpacked and applied correctly.
    processor = self.processor_class(
        image_processor=self.get_component("image_processor"),
        video_processor=self.get_component("video_processor"),
        tokenizer=self.get_component("tokenizer"),
    )
    self.skip_processor_without_typed_kwargs(processor)

    prompt = "lower newer"
    images = self.prepare_image_inputs()

    # one sub-dict per modality, plus kwargs shared by all of them
    call_kwargs = {
        "common_kwargs": {"return_tensors": "pt"},
        "images_kwargs": {"size": {"height": 214, "width": 214}},
        "text_kwargs": {"padding": "max_length", "max_length": 76},
    }

    outputs = processor(text=prompt, images=images, **call_kwargs)
    self.skip_processor_without_typed_kwargs(processor)

    self.assertEqual(outputs["pixel_values"].shape[3], 214)
    self.assertEqual(len(outputs["input_ids"][0]), 76)

@require_torch
@require_vision
def test_structured_kwargs_nested_from_dict(self):
    # Like test_structured_kwargs_nested, with the grouped kwargs built as a
    # plain dict and splatted into the call.
    processor = self.processor_class(
        image_processor=self.get_component("image_processor"),
        video_processor=self.get_component("video_processor"),
        tokenizer=self.get_component("tokenizer"),
    )
    self.skip_processor_without_typed_kwargs(processor)
    prompt = "lower newer"
    images = self.prepare_image_inputs()

    # one sub-dict per modality, plus kwargs shared by all of them
    call_kwargs = {
        "common_kwargs": {"return_tensors": "pt"},
        "images_kwargs": {"size": {"height": 214, "width": 214}},
        "text_kwargs": {"padding": "max_length", "max_length": 76},
    }

    outputs = processor(text=prompt, images=images, **call_kwargs)
    self.assertEqual(outputs["pixel_values"].shape[3], 214)
    self.assertEqual(len(outputs["input_ids"][0]), 76)

@require_torch
@require_vision
def test_doubly_passed_kwargs(self):
    # Passing `size` both flat and inside `images_kwargs` is ambiguous and
    # must be rejected.
    processor = self.processor_class(
        image_processor=self.get_component("image_processor"),
        video_processor=self.get_component("video_processor"),
        tokenizer=self.get_component("tokenizer"),
    )
    self.skip_processor_without_typed_kwargs(processor)

    prompts = ["lower newer"]
    images = self.prepare_image_inputs()
    with self.assertRaises(ValueError):
        _ = processor(
            text=prompts,
            images=images,
            images_kwargs={"size": {"height": 222, "width": 222}},
            size={"height": 214, "width": 214},
        )

@require_vision
@require_torch
def test_kwargs_overrides_default_tokenizer_kwargs(self):
    processor = self.processor_class(
        image_processor=self.get_component("image_processor"),
        video_processor=self.get_component("video_processor"),
        tokenizer=self.get_component("tokenizer", max_length=117),
    )
    self.skip_processor_without_typed_kwargs(processor)
    prompt = "lower newer"
    images = self.prepare_image_inputs()

    # max_length given at call time (112) overrides the init default (117)
    outputs = processor(text=prompt, images=images, return_tensors="pt", max_length=112)
    self.assertEqual(len(outputs["input_ids"][0]), 112)

@require_vision
@require_torch
def test_tokenizer_defaults_preserved_by_kwargs(self):
    processor = self.processor_class(
        image_processor=self.get_component("image_processor"),
        video_processor=self.get_component("video_processor"),
        tokenizer=self.get_component("tokenizer", max_length=117),
    )
    self.skip_processor_without_typed_kwargs(processor)
    prompt = "lower newer"
    images = self.prepare_image_inputs()

    # with no call-time override, the tokenizer's init-time max_length applies
    outputs = processor(text=prompt, images=images, return_tensors="pt")
    self.assertEqual(len(outputs["input_ids"][0]), 117)
127 changes: 0 additions & 127 deletions tests/models/qwen2_vl/test_processing_qwen2_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,130 +108,3 @@ def test_model_input_names(self):
inputs = processor(text=input_str, images=image_input, videos=video_inputs)

self.assertListEqual(list(inputs.keys()), processor.model_input_names)

# Qwen2-VL doesn't accept `size` and resizes to an optimal size using image_processor attributes
# defined at `init`. Therefore, all tests are overwritten and don't actually test whether kwargs
# are passed to image processors.
def test_image_processor_defaults_preserved_by_image_kwargs(self):
    processor = self.processor_class(
        image_processor=self.get_component("image_processor"),
        tokenizer=self.get_component("tokenizer", max_length=117, padding="max_length"),
    )
    self.skip_processor_without_typed_kwargs(processor)

    outputs = processor(text="lower newer", images=self.prepare_image_inputs())
    # default image-processor config yields a first pixel_values dimension of 800
    self.assertEqual(outputs["pixel_values"].shape[0], 800)

def test_kwargs_overrides_default_image_processor_kwargs(self):
    # No size kwargs are passed here (Qwen2-VL ignores `size`); the call is
    # only checked against the default configuration.
    processor = self.processor_class(
        image_processor=self.get_component("image_processor"),
        tokenizer=self.get_component("tokenizer", max_length=117, padding="max_length"),
    )
    self.skip_processor_without_typed_kwargs(processor)

    outputs = processor(text="lower newer", images=self.prepare_image_inputs())
    self.assertEqual(outputs["pixel_values"].shape[0], 800)

def test_unstructured_kwargs(self):
    processor = self.processor_class(
        image_processor=self.get_component("image_processor"),
        tokenizer=self.get_component("tokenizer"),
    )
    self.skip_processor_without_typed_kwargs(processor)

    outputs = processor(
        text="lower newer",
        images=self.prepare_image_inputs(),
        return_tensors="pt",
        padding="max_length",
        max_length=76,
    )

    # image output keeps its default first dimension; text is padded to max_length
    self.assertEqual(outputs["pixel_values"].shape[0], 800)
    self.assertEqual(len(outputs["input_ids"][0]), 76)

def test_unstructured_kwargs_batched(self):
    if "image_processor" not in self.processor_class.attributes:
        self.skipTest(f"image_processor attribute not present in {self.processor_class}")
    processor = self.processor_class(
        image_processor=self.get_component("image_processor"),
        tokenizer=self.get_component("tokenizer"),
    )
    self.skip_processor_without_typed_kwargs(processor)

    prompts = ["lower newer", "upper older longer string"]
    images = self.prepare_image_inputs() * 2
    outputs = processor(
        text=prompts,
        images=images,
        return_tensors="pt",
        padding="longest",
        max_length=76,
    )

    # two images double the pixel_values rows; "longest" pads text to 4 tokens
    self.assertEqual(outputs["pixel_values"].shape[0], 1600)
    self.assertEqual(len(outputs["input_ids"][0]), 4)

def test_structured_kwargs_nested(self):
    processor = self.processor_class(
        image_processor=self.get_component("image_processor"),
        tokenizer=self.get_component("tokenizer"),
    )
    self.skip_processor_without_typed_kwargs(processor)

    prompt = "lower newer"
    images = self.prepare_image_inputs()

    # kwargs grouped per modality (no images_kwargs: Qwen2-VL ignores `size`)
    call_kwargs = {
        "common_kwargs": {"return_tensors": "pt"},
        "text_kwargs": {"padding": "max_length", "max_length": 76},
    }

    outputs = processor(text=prompt, images=images, **call_kwargs)
    self.skip_processor_without_typed_kwargs(processor)

    self.assertEqual(outputs["pixel_values"].shape[0], 800)
    self.assertEqual(len(outputs["input_ids"][0]), 76)

def test_structured_kwargs_nested_from_dict(self):
    processor = self.processor_class(
        image_processor=self.get_component("image_processor"),
        tokenizer=self.get_component("tokenizer"),
    )
    self.skip_processor_without_typed_kwargs(processor)
    prompt = "lower newer"
    images = self.prepare_image_inputs()

    # kwargs grouped per modality, built as a plain dict and splatted in
    call_kwargs = {
        "common_kwargs": {"return_tensors": "pt"},
        "text_kwargs": {"padding": "max_length", "max_length": 76},
    }

    outputs = processor(text=prompt, images=images, **call_kwargs)
    self.assertEqual(outputs["pixel_values"].shape[0], 800)
    self.assertEqual(len(outputs["input_ids"][0]), 76)

def test_image_processor_defaults_preserved_by_video_kwargs(self):
    processor = self.processor_class(
        image_processor=self.get_component("image_processor"),
        tokenizer=self.get_component("tokenizer", max_length=117, padding="max_length"),
    )
    self.skip_processor_without_typed_kwargs(processor)

    # video path: default config yields a first pixel_values_videos dimension of 9600
    outputs = processor(text="lower newer", videos=self.prepare_video_inputs())
    self.assertEqual(outputs["pixel_values_videos"].shape[0], 9600)
Loading

0 comments on commit df6ddaf

Please sign in to comment.