diff --git a/src/transformers/models/llava_onevision/processing_llava_onevision.py b/src/transformers/models/llava_onevision/processing_llava_onevision.py
index e050ec3f31deea..87e716b5855fe9 100644
--- a/src/transformers/models/llava_onevision/processing_llava_onevision.py
+++ b/src/transformers/models/llava_onevision/processing_llava_onevision.py
@@ -51,11 +51,11 @@ class LlavaOnevisionProcessor(ProcessorMixin):
     r"""
     Constructs a LLaVa-Onevision processor which wraps a LLaVa-Onevision video processor, LLaVa-NeXT image processor and a LLaMa tokenizer into a single processor.
 
-    [`LlavaNextProcessor`] offers all the functionalities of [`LlavaOnevisionVideoProcessor`], [`LlavaNextImageProcessor`] and [`LlamaTokenizerFast`]. See the
+    [`LlavaOnevisionProcessor`] offers all the functionalities of [`LlavaOnevisionVideoProcessor`], [`LlavaOnevisionImageProcessor`] and [`LlamaTokenizerFast`]. See the
     [`~LlavaOnevisionVideoProcessor.__call__`], [`~LlavaNextProcessor.__call__`] and [`~LlavaNextProcessor.decode`] for more information.
 
     Args:
-        image_processor ([`LlavaNextImageProcessor`], *optional*):
+        image_processor ([`LlavaOnevisionImageProcessor`], *optional*):
             The image processor is a required input.
         tokenizer ([`LlamaTokenizerFast`], *optional*):
             The tokenizer is a required input.
@@ -82,7 +82,7 @@ class LlavaOnevisionProcessor(ProcessorMixin):
         "image_token",
         "video_token",
     ]
-    image_processor_class = "AutoImageProcessor"
+    image_processor_class = "LlavaOnevisionImageProcessor"
     tokenizer_class = "AutoTokenizer"
     video_processor_class = "LlavaOnevisionVideoProcessor"
 
@@ -96,7 +96,6 @@ def __init__(
         chat_template=None,
         image_token="<image>",
         video_token="<video>",
-        **kwargs: Unpack[LlavaOnevisionProcessorKwargs],
     ):
         self.num_image_tokens = num_image_tokens
         self.vision_feature_select_strategy = vision_feature_select_strategy
@@ -108,8 +107,9 @@ def __call__(
         self,
         images: ImageInput = None,
         text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+        audio=None,
         videos: VideoInput = None,
-        **kwargs,
+        **kwargs: Unpack[LlavaOnevisionProcessorKwargs],
     ) -> BatchFeature:
         """
         Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
diff --git a/tests/models/llava_onevision/test_processing_llava_onevision.py b/tests/models/llava_onevision/test_processing_llava_onevision.py
index e045f2ba7f0ba2..ab0fe51318f669 100644
--- a/tests/models/llava_onevision/test_processing_llava_onevision.py
+++ b/tests/models/llava_onevision/test_processing_llava_onevision.py
@@ -15,7 +15,7 @@
 import tempfile
 import unittest
 
-from transformers.testing_utils import require_torch, require_vision
+from transformers.testing_utils import require_vision
 from transformers.utils import is_vision_available
 
 from ...test_processing_common import ProcessorTesterMixin
@@ -74,204 +74,3 @@ def test_chat_template(self):
 
         formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
         self.assertEqual(expected_prompt, formatted_prompt)
-
-    @require_torch
-    @require_vision
-    def test_image_processor_defaults_preserved_by_image_kwargs(self):
-        # Rewrite as llava-next image processor return pixel values with an added dimesion for image patches
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor", size=(234, 234))
-        video_processor = self.get_component("video_processor", size=(234, 234))
-        tokenizer = self.get_component("tokenizer", max_length=117)
-
-        processor = self.processor_class(
-            tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
-        )
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-
-        inputs = processor(text=input_str, images=image_input)
-        # added dimension for image patches
-        self.assertEqual(len(inputs["pixel_values"][0][0][0]), 234)
-
-    @require_torch
-    @require_vision
-    def test_kwargs_overrides_default_image_processor_kwargs(self):
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor", crop_size=(234, 234))
-        video_processor = self.get_component("video_processor", size=(234, 234))
-        tokenizer = self.get_component("tokenizer", max_length=117)
-
-        processor = self.processor_class(
-            tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
-        )
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-
-        inputs = processor(text=input_str, images=image_input, size=[224, 224])
-        # added dimension for image patches
-        self.assertEqual(len(inputs["pixel_values"][0][0][0]), 224)
-
-    @require_torch
-    @require_vision
-    def test_unstructured_kwargs(self):
-        image_processor = self.get_component("image_processor")
-        video_processor = self.get_component("video_processor")
-        tokenizer = self.get_component("tokenizer")
-        processor = self.processor_class(
-            tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
-        )
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-        inputs = processor(
-            text=input_str,
-            images=image_input,
-            return_tensors="pt",
-            size={"height": 214, "width": 214},
-            padding="max_length",
-            max_length=76,
-        )
-
-        # added dimension for image patches
-        self.assertEqual(inputs["pixel_values"].shape[3], 214)
-        self.assertEqual(len(inputs["input_ids"][0]), 76)
-
-    @require_torch
-    @require_vision
-    def test_unstructured_kwargs_batched(self):
-        image_processor = self.get_component("image_processor")
-        video_processor = self.get_component("video_processor")
-        tokenizer = self.get_component("tokenizer")
-        processor = self.processor_class(
-            tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
-        )
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = ["lower newer", "upper older longer string"]
-        image_input = self.prepare_image_inputs() * 2
-        inputs = processor(
-            text=input_str,
-            images=image_input,
-            return_tensors="pt",
-            size={"height": 214, "width": 214},
-            padding="longest",
-            max_length=76,
-        )
-        self.assertEqual(inputs["pixel_values"].shape[3], 214)
-        self.assertEqual(len(inputs["input_ids"][0]), 5)
-
-    @require_torch
-    @require_vision
-    def test_structured_kwargs_nested(self):
-        image_processor = self.get_component("image_processor")
-        video_processor = self.get_component("video_processor")
-        tokenizer = self.get_component("tokenizer")
-        processor = self.processor_class(
-            tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
-        )
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-
-        # Define the kwargs for each modality
-        all_kwargs = {
-            "common_kwargs": {"return_tensors": "pt"},
-            "images_kwargs": {"size": {"height": 214, "width": 214}},
-            "text_kwargs": {"padding": "max_length", "max_length": 76},
-        }
-
-        inputs = processor(text=input_str, images=image_input, **all_kwargs)
-        self.skip_processor_without_typed_kwargs(processor)
-
-        self.assertEqual(inputs["pixel_values"].shape[3], 214)
-        self.assertEqual(len(inputs["input_ids"][0]), 76)
-
-    @require_torch
-    @require_vision
-    def test_structured_kwargs_nested_from_dict(self):
-        image_processor = self.get_component("image_processor")
-        video_processor = self.get_component("video_processor")
-        tokenizer = self.get_component("tokenizer")
-
-        processor = self.processor_class(
-            tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
-        )
-        self.skip_processor_without_typed_kwargs(processor)
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-
-        # Define the kwargs for each modality
-        all_kwargs = {
-            "common_kwargs": {"return_tensors": "pt"},
-            "images_kwargs": {"size": {"height": 214, "width": 214}},
-            "text_kwargs": {"padding": "max_length", "max_length": 76},
-        }
-
-        inputs = processor(text=input_str, images=image_input, **all_kwargs)
-        self.assertEqual(inputs["pixel_values"].shape[3], 214)
-        self.assertEqual(len(inputs["input_ids"][0]), 76)
-
-    @require_torch
-    @require_vision
-    def test_doubly_passed_kwargs(self):
-        image_processor = self.get_component("image_processor")
-        video_processor = self.get_component("video_processor")
-        tokenizer = self.get_component("tokenizer")
-
-        processor = self.processor_class(
-            tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
-        )
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = ["lower newer"]
-        image_input = self.prepare_image_inputs()
-        with self.assertRaises(ValueError):
-            _ = processor(
-                text=input_str,
-                images=image_input,
-                images_kwargs={"size": {"height": 222, "width": 222}},
-                size={"height": 214, "width": 214},
-            )
-
-    @require_vision
-    @require_torch
-    def test_kwargs_overrides_default_tokenizer_kwargs(self):
-        image_processor = self.get_component("image_processor")
-        video_processor = self.get_component("video_processor")
-        tokenizer = self.get_component("tokenizer", max_length=117)
-
-        processor = self.processor_class(
-            tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
-        )
-        self.skip_processor_without_typed_kwargs(processor)
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-
-        inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=112)
-        self.assertEqual(len(inputs["input_ids"][0]), 112)
-
-    @require_vision
-    @require_torch
-    def test_tokenizer_defaults_preserved_by_kwargs(self):
-        image_processor = self.get_component("image_processor")
-        video_processor = self.get_component("video_processor")
-        tokenizer = self.get_component("tokenizer", max_length=117)
-
-        processor = self.processor_class(
-            tokenizer=tokenizer, image_processor=image_processor, video_processor=video_processor
-        )
-        self.skip_processor_without_typed_kwargs(processor)
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-
-        inputs = processor(text=input_str, images=image_input, return_tensors="pt")
-        self.assertEqual(len(inputs["input_ids"][0]), 117)
diff --git a/tests/models/qwen2_vl/test_processing_qwen2_vl.py b/tests/models/qwen2_vl/test_processing_qwen2_vl.py
index d1ae16a9aa46e2..a360fc98f4c584 100644
--- a/tests/models/qwen2_vl/test_processing_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_processing_qwen2_vl.py
@@ -108,130 +108,3 @@ def test_model_input_names(self):
         inputs = processor(text=input_str, images=image_input, videos=video_inputs)
 
         self.assertListEqual(list(inputs.keys()), processor.model_input_names)
-
-    # Qwen2-VL doesn't accept `size` and resized to an optimal size using image_processor attrbutes
-    # defined at `init`. Therefore, all tests are overwritten and don't actually test if kwargs are passed
-    # to image processors
-    def test_image_processor_defaults_preserved_by_image_kwargs(self):
-        image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
-
-        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-
-        inputs = processor(text=input_str, images=image_input)
-        self.assertEqual(inputs["pixel_values"].shape[0], 800)
-
-    def test_kwargs_overrides_default_image_processor_kwargs(self):
-        image_processor = self.get_component(
-            "image_processor",
-        )
-        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
-
-        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-
-        inputs = processor(text=input_str, images=image_input)
-        self.assertEqual(inputs["pixel_values"].shape[0], 800)
-
-    def test_unstructured_kwargs(self):
-        image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer")
-
-        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-        inputs = processor(
-            text=input_str,
-            images=image_input,
-            return_tensors="pt",
-            padding="max_length",
-            max_length=76,
-        )
-
-        self.assertEqual(inputs["pixel_values"].shape[0], 800)
-        self.assertEqual(len(inputs["input_ids"][0]), 76)
-
-    def test_unstructured_kwargs_batched(self):
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer")
-
-        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = ["lower newer", "upper older longer string"]
-        image_input = self.prepare_image_inputs() * 2
-        inputs = processor(
-            text=input_str,
-            images=image_input,
-            return_tensors="pt",
-            padding="longest",
-            max_length=76,
-        )
-
-        self.assertEqual(inputs["pixel_values"].shape[0], 1600)
-        self.assertEqual(len(inputs["input_ids"][0]), 4)
-
-    def test_structured_kwargs_nested(self):
-        image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer")
-
-        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-
-        # Define the kwargs for each modality
-        all_kwargs = {
-            "common_kwargs": {"return_tensors": "pt"},
-            "text_kwargs": {"padding": "max_length", "max_length": 76},
-        }
-
-        inputs = processor(text=input_str, images=image_input, **all_kwargs)
-        self.skip_processor_without_typed_kwargs(processor)
-
-        self.assertEqual(inputs["pixel_values"].shape[0], 800)
-        self.assertEqual(len(inputs["input_ids"][0]), 76)
-
-    def test_structured_kwargs_nested_from_dict(self):
-        image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer")
-
-        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
-        self.skip_processor_without_typed_kwargs(processor)
-        input_str = "lower newer"
-        image_input = self.prepare_image_inputs()
-
-        # Define the kwargs for each modality
-        all_kwargs = {
-            "common_kwargs": {"return_tensors": "pt"},
-            "text_kwargs": {"padding": "max_length", "max_length": 76},
-        }
-
-        inputs = processor(text=input_str, images=image_input, **all_kwargs)
-        self.assertEqual(inputs["pixel_values"].shape[0], 800)
-        self.assertEqual(len(inputs["input_ids"][0]), 76)
-
-    def test_image_processor_defaults_preserved_by_video_kwargs(self):
-        image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
-
-        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = "lower newer"
-        video_input = self.prepare_video_inputs()
-
-        inputs = processor(text=input_str, videos=video_input)
-        self.assertEqual(inputs["pixel_values_videos"].shape[0], 9600)
diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py
index a02cf310bfeaac..1023e7e03f22d6 100644
--- a/tests/test_processing_common.py
+++ b/tests/test_processing_common.py
@@ -153,7 +153,7 @@ def test_image_processor_defaults_preserved_by_image_kwargs(self):
             self.skipTest(f"image_processor attribute not present in {self.processor_class}")
         processor_components = self.prepare_components()
         processor_components["image_processor"] = self.get_component(
-            "image_processor", size=(234, 234), crop_size=(234, 234)
+            "image_processor", do_rescale=True, rescale_factor=-1
         )
         processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
 
@@ -164,7 +164,7 @@ def test_image_processor_defaults_preserved_by_image_kwargs(self):
         image_input = self.prepare_image_inputs()
 
         inputs = processor(text=input_str, images=image_input, return_tensors="pt")
-        self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 234)
+        self.assertLessEqual(inputs[self.images_data_arg_name].mean(), 0)
 
     def test_kwargs_overrides_default_tokenizer_kwargs(self):
         if "image_processor" not in self.processor_class.attributes:
@@ -186,7 +186,9 @@ def test_kwargs_overrides_default_image_processor_kwargs(self):
         if "image_processor" not in self.processor_class.attributes:
             self.skipTest(f"image_processor attribute not present in {self.processor_class}")
         processor_components = self.prepare_components()
-        processor_components["image_processor"] = self.get_component("image_processor", size=(234, 234))
+        processor_components["image_processor"] = self.get_component(
+            "image_processor", do_rescale=True, rescale_factor=1
+        )
         processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length")
 
         processor = self.processor_class(**processor_components)
@@ -195,10 +197,8 @@ def test_kwargs_overrides_default_image_processor_kwargs(self):
         input_str = "lower newer"
         image_input = self.prepare_image_inputs()
 
-        inputs = processor(
-            text=input_str, images=image_input, size=[224, 224], crop_size=(224, 224), return_tensors="pt"
-        )
-        self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 224)
+        inputs = processor(text=input_str, images=image_input, do_rescale=True, rescale_factor=-1, return_tensors="pt")
+        self.assertLessEqual(inputs[self.images_data_arg_name].mean(), 0)
 
     def test_unstructured_kwargs(self):
         if "image_processor" not in self.processor_class.attributes:
@@ -213,13 +213,13 @@ def test_unstructured_kwargs(self):
             text=input_str,
             images=image_input,
             return_tensors="pt",
-            size={"height": 214, "width": 214},
-            crop_size={"height": 214, "width": 214},
+            do_rescale=True,
+            rescale_factor=-1,
             padding="max_length",
             max_length=76,
         )
 
-        self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 214)
+        self.assertLessEqual(inputs[self.images_data_arg_name].mean(), 0)
         self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 76)
 
     def test_unstructured_kwargs_batched(self):
@@ -235,13 +235,13 @@ def test_unstructured_kwargs_batched(self):
             text=input_str,
             images=image_input,
             return_tensors="pt",
-            size={"height": 214, "width": 214},
-            crop_size={"height": 214, "width": 214},
+            do_rescale=True,
+            rescale_factor=-1,
             padding="longest",
             max_length=76,
         )
 
-        self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 214)
+        self.assertLessEqual(inputs[self.images_data_arg_name].mean(), 0)
         self.assertTrue(
             len(inputs[self.text_data_arg_name][0]) == len(inputs[self.text_data_arg_name][1])
             and len(inputs[self.text_data_arg_name][1]) < 76
@@ -260,9 +260,8 @@ def test_doubly_passed_kwargs(self):
             _ = processor(
                 text=input_str,
                 images=image_input,
-                images_kwargs={"size": {"height": 222, "width": 222}},
-                size={"height": 214, "width": 214},
-                crop_size={"height": 214, "width": 214},
+                images_kwargs={"do_rescale": True, "rescale_factor": -1},
+                do_rescale=True,
                 return_tensors="pt",
             )
 
@@ -279,17 +278,14 @@ def test_structured_kwargs_nested(self):
         # Define the kwargs for each modality
         all_kwargs = {
             "common_kwargs": {"return_tensors": "pt"},
-            "images_kwargs": {
-                "size": {"height": 214, "width": 214},
-                "crop_size": {"height": 214, "width": 214},
-            },
+            "images_kwargs": {"do_rescale": True, "rescale_factor": -1},
             "text_kwargs": {"padding": "max_length", "max_length": 76},
         }
 
         inputs = processor(text=input_str, images=image_input, **all_kwargs)
         self.skip_processor_without_typed_kwargs(processor)
 
-        self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 214)
+        self.assertLessEqual(inputs[self.images_data_arg_name].mean(), 0)
         self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 76)
 
     def test_structured_kwargs_nested_from_dict(self):
@@ -304,15 +300,12 @@ def test_structured_kwargs_nested_from_dict(self):
         # Define the kwargs for each modality
         all_kwargs = {
             "common_kwargs": {"return_tensors": "pt"},
-            "images_kwargs": {
-                "size": {"height": 214, "width": 214},
-                "crop_size": {"height": 214, "width": 214},
-            },
+            "images_kwargs": {"do_rescale": True, "rescale_factor": -1},
             "text_kwargs": {"padding": "max_length", "max_length": 76},
         }
 
         inputs = processor(text=input_str, images=image_input, **all_kwargs)
-        self.assertEqual(inputs[self.images_data_arg_name].shape[-1], 214)
+        self.assertLessEqual(inputs[self.images_data_arg_name].mean(), 0)
         self.assertEqual(inputs[self.text_data_arg_name].shape[-1], 76)