From cb961ada70d1a7d3fab520635a41c1aa161680c1 Mon Sep 17 00:00:00 2001
From: yonigozlan
Date: Mon, 16 Sep 2024 20:14:30 +0000
Subject: [PATCH] fix failing tests

---
 .../llava_next/test_processor_llava_next.py | 144 +++++++++++++++++-
 .../pix2struct/test_processor_pix2struct.py |  49 +++++-
 2 files changed, 185 insertions(+), 8 deletions(-)

diff --git a/tests/models/llava_next/test_processor_llava_next.py b/tests/models/llava_next/test_processor_llava_next.py
index e10e7fd38fba2f..06c7ebda755d42 100644
--- a/tests/models/llava_next/test_processor_llava_next.py
+++ b/tests/models/llava_next/test_processor_llava_next.py
@@ -17,7 +17,10 @@
 
 import torch
 
-from transformers.testing_utils import require_vision
+from transformers.testing_utils import (
+    require_torch,
+    require_vision,
+)
 from transformers.utils import is_vision_available
 
 from ...test_processing_common import ProcessorTesterMixin
@@ -96,3 +99,142 @@ def test_image_token_filling(self):
         )
         image_tokens = (inputs["input_ids"] == image_token_index).sum().item()
         self.assertEqual(expected_image_tokens, image_tokens)
+
+    @require_torch
+    @require_vision
+    def test_image_processor_defaults_preserved_by_image_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor", crop_size=(234, 234))
+        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        inputs = processor(text=input_str, images=image_input)
+        self.assertEqual(len(inputs["pixel_values"][0][0][0]), 234)
+
+    @require_torch
+    @require_vision
+    def test_kwargs_overrides_default_image_processor_kwargs(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor", crop_size=(234, 234))
+        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        inputs = processor(text=input_str, images=image_input, crop_size=[224, 224])
+        self.assertEqual(len(inputs["pixel_values"][0][0][0]), 224)
+
+    @require_torch
+    @require_vision
+    def test_structured_kwargs_nested(self):
+        if "image_processor" not in self.processor_class.attributes:
+            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        self.skip_processor_without_typed_kwargs(processor)
+
+        input_str = "lower newer"
+        image_input = self.prepare_image_inputs()
+
+        # Define the kwargs for each modality
+        all_kwargs = {
+            "common_kwargs": {"return_tensors": "pt"},
+            "images_kwargs": {"crop_size": {"height": 214, "width": 214}},
+            "text_kwargs": {"padding": "max_length", "max_length": 76},
+        }
+
+        inputs = processor(text=input_str, images=image_input, **all_kwargs)
+        self.skip_processor_without_typed_kwargs(processor)
+
self.assertEqual(inputs["pixel_values"].shape[-1], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + @require_vision + def test_structured_kwargs_nested_from_dict(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": {"crop_size": {"height": 214, "width": 214}}, + "text_kwargs": {"padding": "max_length", "max_length": 76}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.assertEqual(inputs["pixel_values"].shape[-1], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + @require_vision + def test_unstructured_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + crop_size={"height": 214, "width": 214}, + padding="max_length", + max_length=76, + ) + + self.assertEqual(inputs["pixel_values"].shape[-1], 214) + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + @require_vision + def test_unstructured_kwargs_batched(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = ["lower newer", "upper older longer string"] + image_input = self.prepare_image_inputs() * 2 + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + crop_size={"height": 214, "width": 214}, + padding="longest", + max_length=76, + ) + + self.assertEqual(inputs["pixel_values"].shape[-1], 214) + + self.assertEqual(len(inputs["input_ids"][0]), 5) diff --git a/tests/models/pix2struct/test_processor_pix2struct.py b/tests/models/pix2struct/test_processor_pix2struct.py index 7be23af9679430..a0cf25528ca7e3 100644 --- a/tests/models/pix2struct/test_processor_pix2struct.py +++ b/tests/models/pix2struct/test_processor_pix2struct.py @@ -37,7 +37,6 @@ @require_torch class Pix2StructProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = Pix2StructProcessor - text_data_arg_name = "decoder_input_ids" def setUp(self): self.tmpdirname = tempfile.mkdtemp() @@ -182,10 +181,27 @@ def test_model_input_names(self): # For now the processor supports only ["flattened_patches", "input_ids", "attention_mask", "decoder_attention_mask"] self.assertListEqual(list(inputs.keys()), ["input_ids", 
"attention_mask"]) - # Rewrite as pix2struct processor return "flattened_patches" and not "pixel_values" + @require_vision + @require_torch + def test_tokenizer_defaults_preserved_by_kwargs(self): + # Rewrite as pix2struct processor return "decoder_input_ids" and not "input_ids" + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input, return_tensors="pt") + self.assertEqual(len(inputs["decoder_input_ids"][0]), 117) + @require_torch @require_vision def test_image_processor_defaults_preserved_by_image_kwargs(self): + # Rewrite as pix2struct processor return "flattened_patches" and not "pixel_values" if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor", max_patches=1024, patch_size={"height": 8, "width": 8}) @@ -200,10 +216,29 @@ def test_image_processor_defaults_preserved_by_image_kwargs(self): inputs = processor(text=input_str, images=image_input) self.assertEqual(len(inputs["flattened_patches"][0][0]), 194) - # Rewrite as pix2struct processor return "flattened_patches" and not "pixel_values" + @require_vision + @require_torch + def test_kwargs_overrides_default_tokenizer_kwargs(self): + # Rewrite as pix2struct processor return "decoder_input_ids" and not "input_ids" + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer", padding="longest") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor( + text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length" + ) + self.assertEqual(len(inputs["decoder_input_ids"][0]), 112) + @require_torch @require_vision def test_kwargs_overrides_default_image_processor_kwargs(self): + # Rewrite as pix2struct processor return "flattened_patches" and not "pixel_values" if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor", max_patches=4096) @@ -218,10 +253,10 @@ def test_kwargs_overrides_default_image_processor_kwargs(self): inputs = processor(text=input_str, images=image_input, max_patches=1024) self.assertEqual(len(inputs["flattened_patches"][0]), 1024) - # Rewrite as pix2struct processor return "flattened_patches" and not "pixel_values" @require_torch @require_vision def test_unstructured_kwargs(self): + # Rewrite as pix2struct processor return "decoder_input_ids" and not "input_ids" if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = 
self.get_component("image_processor") @@ -244,10 +279,10 @@ def test_unstructured_kwargs(self): self.assertEqual(inputs["flattened_patches"].shape[1], 1024) self.assertEqual(len(inputs["decoder_input_ids"][0]), 76) - # Rewrite as pix2struct processor return "flattened_patches" and not "pixel_values" @require_torch @require_vision def test_unstructured_kwargs_batched(self): + # Rewrite as pix2struct processor return "decoder_input_ids" and not "input_ids" if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") @@ -271,10 +306,10 @@ def test_unstructured_kwargs_batched(self): self.assertEqual(len(inputs["decoder_input_ids"][0]), 5) - # Rewrite as pix2struct processor return "flattened_patches" and not "pixel_values" @require_torch @require_vision def test_structured_kwargs_nested(self): + # Rewrite as pix2struct processor return "decoder_input_ids" and not "input_ids" if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") @@ -300,10 +335,10 @@ def test_structured_kwargs_nested(self): self.assertEqual(len(inputs["decoder_input_ids"][0]), 76) - # Rewrite as pix2struct processor return "flattened_patches" and not "pixel_values" @require_torch @require_vision def test_structured_kwargs_nested_from_dict(self): + # Rewrite as pix2struct processor return "decoder_input_ids" and not "input_ids" if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}")