diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index 2a3a2edff5e440..050dfb91a1a874 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -118,10 +118,15 @@ def __call__( `None`). - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ - # check if images and text inputs are reversed for BC if text is None and images is None: raise ValueError("You must specify either text or images.") - if text is not None and not isinstance(text[0], str) or images is not None and isinstance(images[0], str): + # check if images and text inputs are reversed for BC + if ( + text is not None + and not isinstance(text[0], str) + or images is not None + and (isinstance(images, str) or (isinstance(images, (list, tuple)) and isinstance(images[0], str))) + ): warnings.warn( "It looks like you are passing the inputs in the wrong order. You should pass the images input first and the text input second." "Images and text inputs will be swapped." 
diff --git a/src/transformers/models/kosmos2/processing_kosmos2.py b/src/transformers/models/kosmos2/processing_kosmos2.py index dad505757c0664..596b9d644c546a 100644 --- a/src/transformers/models/kosmos2/processing_kosmos2.py +++ b/src/transformers/models/kosmos2/processing_kosmos2.py @@ -173,14 +173,13 @@ def __call__( if images is None and text is None: raise ValueError("You have to specify either images or text.") - # Temporary fix for "paddding_side" in init_kwargs - _ = self.tokenizer.init_kwargs.pop("padding_side", None) - output_kwargs = self._merge_kwargs( Kosmos2ProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, ) + # Temporary fix for "padding_side" in init_kwargs + _ = output_kwargs["text_kwargs"].pop("padding_side", None) bboxes = output_kwargs["images_kwargs"].pop("bboxes", None) num_image_tokens = output_kwargs["images_kwargs"].pop("num_image_tokens", 64) diff --git a/src/transformers/models/llava_next/processing_llava_next.py b/src/transformers/models/llava_next/processing_llava_next.py index 59c407690a965e..d8c62b4725fe8c 100644 --- a/src/transformers/models/llava_next/processing_llava_next.py +++ b/src/transformers/models/llava_next/processing_llava_next.py @@ -127,7 +127,12 @@ def __call__( if images is None and text is None: raise ValueError("You have to specify at least images or text.") # check if images and text inputs are reversed for BC - if text is not None and not isinstance(text[0], str) or images is not None and isinstance(images[0], str): + if ( + text is not None + and not isinstance(text[0], str) + or images is not None + and (isinstance(images, str) or (isinstance(images, (list, tuple)) and isinstance(images[0], str))) + ): warnings.warn( "It looks like you are passing the inputs in the wrong order. You should pass the images input first and the text input second." "Images and text inputs will be swapped." 
diff --git a/tests/models/fuyu/test_processing_fuyu.py b/tests/models/fuyu/test_processing_fuyu.py index 00b0c74ca2c607..8ccfad4328cf9e 100644 --- a/tests/models/fuyu/test_processing_fuyu.py +++ b/tests/models/fuyu/test_processing_fuyu.py @@ -13,14 +13,15 @@ if is_vision_available(): from PIL import Image -if is_vision_available() and is_torch_available(): - from transformers import AutoProcessor, FuyuImageProcessor, FuyuProcessor if is_torch_available(): import torch from transformers.models.fuyu.processing_fuyu import construct_full_unpacked_stream, full_unpacked_stream_to_tensor +if is_vision_available() and is_torch_available(): + from transformers import AutoProcessor, FuyuImageProcessor, FuyuProcessor + @require_torch @require_vision @@ -177,7 +178,7 @@ def test_fuyu_processing_multiple_image_sample(self): @require_vision @require_torch def test_kwargs_overrides_default_tokenizer_kwargs(self): - # rewrite as Fuyu supports tokenizer kwargs only when image is None. + # Rewrite as Fuyu supports tokenizer kwargs only when image is None. 
if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") @@ -194,15 +195,18 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): ) self.assertEqual(len(inputs["input_ids"][0]), 112) + @unittest.skip("Fuyu processor does not support image_processor kwargs") def test_image_processor_defaults_preserved_by_image_kwargs(self): - self.skipTest(reason="Fuyu processor does not support image_processor kwargs") + pass + @unittest.skip("Fuyu processor does not support image_processor kwargs") def test_kwargs_overrides_default_image_processor_kwargs(self): - self.skipTest(reason="Fuyu processor does not support image_processor kwargs") + pass @require_vision @require_torch def test_tokenizer_defaults_preserved_by_kwargs(self): + # Rewrite as Fuyu supports tokenizer kwargs only when image is None. if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") @@ -211,6 +215,7 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" + # Fuyu uses tokenizer kwargs only when image is None. 
image_input = None inputs = processor(text=input_str, images=image_input, return_tensors="pt") @@ -219,7 +224,7 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): @require_torch @require_vision def test_structured_kwargs_nested(self): - # rewrite as Fuyu image processor does not return pixel values + # Rewrite as Fuyu image processor does not return pixel values if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") @@ -246,7 +251,7 @@ def test_structured_kwargs_nested(self): @require_torch @require_vision def test_structured_kwargs_nested_from_dict(self): - # rewrite as Fuyu image processor does not return pixel values + # Rewrite as Fuyu image processor does not return pixel values if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") @@ -272,7 +277,7 @@ def test_structured_kwargs_nested_from_dict(self): @require_torch @require_vision def test_unstructured_kwargs(self): - # rewrite as Fuyu supports tokenizer kwargs only when image is None. + # Rewrite as Fuyu supports tokenizer kwargs only when image is None. if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") @@ -297,7 +302,7 @@ def test_unstructured_kwargs(self): @require_torch @require_vision def test_unstructured_kwargs_batched(self): - # rewrite as Fuyu supports tokenizer kwargs only when image is None. + # Rewrite as Fuyu supports tokenizer kwargs only when image is None. 
if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") diff --git a/tests/models/instructblip/test_processor_instructblip.py b/tests/models/instructblip/test_processor_instructblip.py index 2a09cef2428e93..edd4d370ccb1d8 100644 --- a/tests/models/instructblip/test_processor_instructblip.py +++ b/tests/models/instructblip/test_processor_instructblip.py @@ -207,7 +207,7 @@ def test_model_input_names(self): @require_torch @require_vision def test_image_processor_defaults_preserved_by_image_kwargs(self): - # rewrite as instructblip needs a qformer_tokenizer + # Rewrite as InstructBlip needs a qformer_tokenizer if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor", size=(234, 234)) @@ -220,14 +220,14 @@ def test_image_processor_defaults_preserved_by_image_kwargs(self): input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input) + self.assertEqual(len(inputs["pixel_values"][0][0]), 234) @require_torch @require_vision def test_kwargs_overrides_default_image_processor_kwargs(self): - # rewrite as instructblip needs a qformer_tokenizer + # Rewrite as InstructBlip needs a qformer_tokenizer if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor", size=(234, 234)) @@ -241,14 +241,14 @@ def test_kwargs_overrides_default_image_processor_kwargs(self): input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input, size=[224, 224]) + self.assertEqual(len(inputs["pixel_values"][0][0]), 224) @require_vision @require_torch def 
test_kwargs_overrides_default_tokenizer_kwargs(self): - # rewrite as instructblip needs a qformer_tokenizer + # Rewrite as InstructBlip needs a qformer_tokenizer if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") @@ -263,16 +263,16 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor( text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length" ) + self.assertEqual(len(inputs["input_ids"][0]), 112) @require_torch @require_vision def test_structured_kwargs_nested(self): - # rewrite as instructblip needs a qformer_tokenizer + # Rewrite as InstructBlip needs a qformer_tokenizer if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") @@ -295,18 +295,16 @@ def test_structured_kwargs_nested(self): "images_kwargs": {"size": {"height": 214, "width": 214}}, "text_kwargs": {"padding": "max_length", "max_length": 76}, } - inputs = processor(text=input_str, images=image_input, **all_kwargs) self.skip_processor_without_typed_kwargs(processor) self.assertEqual(inputs["pixel_values"].shape[2], 214) - self.assertEqual(len(inputs["input_ids"][0]), 76) @require_torch @require_vision def test_structured_kwargs_nested_from_dict(self): - # rewrite as instructblip needs a qformer_tokenizer + # Rewrite as InstructBlip needs a qformer_tokenizer if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") @@ -329,16 +327,15 @@ def test_structured_kwargs_nested_from_dict(self): "images_kwargs": {"size": {"height": 214, "width": 214}}, 
"text_kwargs": {"padding": "max_length", "max_length": 76}, } - inputs = processor(text=input_str, images=image_input, **all_kwargs) - self.assertEqual(inputs["pixel_values"].shape[2], 214) + self.assertEqual(inputs["pixel_values"].shape[2], 214) self.assertEqual(len(inputs["input_ids"][0]), 76) @require_vision @require_torch def test_tokenizer_defaults_preserved_by_kwargs(self): - # rewrite as instructblip needs a qformer_tokenizer + # Rewrite as InstructBlip needs a qformer_tokenizer if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") @@ -353,14 +350,14 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): self.skip_processor_without_typed_kwargs(processor) input_str = "lower newer" image_input = self.prepare_image_inputs() - inputs = processor(text=input_str, images=image_input, return_tensors="pt") + self.assertEqual(len(inputs["input_ids"][0]), 117) @require_torch @require_vision def test_unstructured_kwargs(self): - # rewrite as instructblip needs a qformer_tokenizer + # Rewrite as InstructBlip needs a qformer_tokenizer if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") @@ -391,7 +388,7 @@ def test_unstructured_kwargs(self): @require_torch @require_vision def test_unstructured_kwargs_batched(self): - # rewrite as instructblip needs a qformer_tokenizer + # Rewrite as InstructBlip needs a qformer_tokenizer if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") @@ -417,5 +414,4 @@ def test_unstructured_kwargs_batched(self): ) self.assertEqual(inputs["pixel_values"].shape[2], 214) - self.assertEqual(len(inputs["input_ids"][0]), 
11) diff --git a/tests/models/kosmos2/test_processor_kosmos2.py b/tests/models/kosmos2/test_processor_kosmos2.py index 6d6cc78d1a8a6a..cba6d032970998 100644 --- a/tests/models/kosmos2/test_processor_kosmos2.py +++ b/tests/models/kosmos2/test_processor_kosmos2.py @@ -60,7 +60,7 @@ class Kosmos2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): def setUp(self): self.tmpdirname = tempfile.mkdtemp() - image_processor = CLIPImageProcessor() + image_processor = CLIPImageProcessor(do_center_crop=False) # We have a SentencePiece fixture for testing slow_tokenizer = XLMRobertaTokenizer(SAMPLE_VOCAB) @@ -487,7 +487,7 @@ def check(texts, bboxes, expected_input_ids): @require_vision @require_torch def test_kwargs_overrides_default_tokenizer_kwargs(self): - # rewrite as Kosmos-2 supports custom padding only when image is None. + # Rewrite as Kosmos-2 supports custom padding only when image is None. if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") @@ -506,12 +506,13 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): max_length=112, padding="max_length", ) + self.assertEqual(len(inputs["input_ids"][0]), 112) @require_torch @require_vision def test_structured_kwargs_nested(self): - # rewrite to test only image_processor kwargs + # Rewrite to test only image_processor kwargs if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") @@ -526,7 +527,7 @@ def test_structured_kwargs_nested(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"crop_size": {"height": 214, "width": 214}}, + "images_kwargs": {"size": {"height": 214, "width": 214}}, } inputs = processor(text=input_str, images=image_input, **all_kwargs) @@ 
-537,7 +538,7 @@ def test_structured_kwargs_nested(self): @require_torch @require_vision def test_structured_kwargs_nested_from_dict(self): - # rewrite to test only image_processor kwargs + # Rewrite to test only image_processor kwargs if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") @@ -552,7 +553,7 @@ def test_structured_kwargs_nested_from_dict(self): # Define the kwargs for each modality all_kwargs = { "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"crop_size": {"height": 214, "width": 214}}, + "images_kwargs": {"size": {"height": 214, "width": 214}}, } inputs = processor(text=input_str, images=image_input, **all_kwargs) @@ -561,7 +562,7 @@ def test_structured_kwargs_nested_from_dict(self): @require_vision @require_torch def test_tokenizer_defaults_preserved_by_kwargs(self): - # rewrite as Kosmos-2 supports custom padding only when image is None. + # Rewrite as Kosmos-2 supports custom padding only when image is None. if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") @@ -579,7 +580,7 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): @require_torch @require_vision def test_unstructured_kwargs(self): - # rewrite as Kosmos-2 supports custom padding only when image is None. + # Rewrite as Kosmos-2 supports custom padding only when image is None. if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") @@ -604,7 +605,7 @@ def test_unstructured_kwargs(self): @require_torch @require_vision def test_unstructured_kwargs_batched(self): - # rewrite as Kosmos-2 supports custom padding only when image is None. 
+ # Rewrite as Kosmos-2 supports custom padding only when image is None. if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") @@ -620,11 +621,9 @@ def test_unstructured_kwargs_batched(self): text=input_str, images=image_input, return_tensors="pt", - crop_size={"height": 214, "width": 214}, + size={"height": 214, "width": 214}, padding="longest", max_length=76, ) - # self.assertEqual(inputs["pixel_values"].shape[2], 214) - self.assertEqual(len(inputs["input_ids"][0]), 10) diff --git a/tests/models/llava_next/test_processor_llava_next.py b/tests/models/llava_next/test_processor_llava_next.py index 70a5012b59d748..8d631a18860238 100644 --- a/tests/models/llava_next/test_processor_llava_next.py +++ b/tests/models/llava_next/test_processor_llava_next.py @@ -72,7 +72,7 @@ def test_chat_template(self): @require_torch @require_vision def test_image_processor_defaults_preserved_by_image_kwargs(self): - # rewrite as llava-next image processor return pixel values with an added dimesion for image patches + # Rewrite as llava-next image processor return pixel values with an added dimension for image patches if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor", crop_size=(234, 234)) @@ -91,7 +91,7 @@ def test_image_processor_defaults_preserved_by_image_kwargs(self): @require_torch @require_vision def test_kwargs_overrides_default_image_processor_kwargs(self): - # rewrite as llava-next image processor return pixel values with an added dimesion for image patches + # Rewrite as llava-next image processor return pixel values with an added dimension for image patches if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in 
{self.processor_class}") image_processor = self.get_component("image_processor", crop_size=(234, 234)) @@ -110,7 +110,7 @@ def test_kwargs_overrides_default_image_processor_kwargs(self): @require_torch @require_vision def test_unstructured_kwargs(self): - # rewrite as llava-next image processor return pixel values with an added dimesion for image patches + # Rewrite as llava-next image processor return pixel values with an added dimension for image patches if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") @@ -136,7 +136,7 @@ def test_unstructured_kwargs(self): @require_torch @require_vision def test_unstructured_kwargs_batched(self): - # rewrite as llava-next image processor return pixel values with an added dimesion for image patches + # Rewrite as llava-next image processor return pixel values with an added dimension for image patches if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") @@ -162,7 +162,7 @@ def test_unstructured_kwargs_batched(self): @require_torch @require_vision def test_structured_kwargs_nested(self): - # rewrite as llava-next image processor return pixel values with an added dimesion for image patches + # Rewrite as llava-next image processor return pixel values with an added dimension for image patches if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}") image_processor = self.get_component("image_processor") @@ -191,7 +191,7 @@ def test_structured_kwargs_nested(self): @require_torch @require_vision def test_structured_kwargs_nested_from_dict(self): - # rewrite as llava-next image processor return pixel values with an added dimesion for image patches + # Rewrite as 
llava-next image processor return pixel values with an added dimension for image patches if "image_processor" not in self.processor_class.attributes: self.skipTest(f"image_processor attribute not present in {self.processor_class}")