Refactor _validate_images_text_input_order

huggingface · Sep 13, 2024 · 3d8ec3d · 3d8ec3d
1 parent 04918e7
commit 3d8ec3d
Show file tree

Hide file tree

Showing 2 changed files with 13 additions and 5 deletions.
diff --git a/src/transformers/models/llava/processing_llava.py b/src/transformers/models/llava/processing_llava.py
@@ -21,7 +21,7 @@
 
 from ...feature_extraction_utils import BatchFeature
 from ...image_utils import ImageInput, get_image_size, to_numpy_array
-from ...processing_utils import ProcessingKwargs, ProcessorMixin, _check_reversed_images_text_for_vlms
+from ...processing_utils import ProcessingKwargs, ProcessorMixin, _validate_images_text_input_order
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
 from ...utils import logging
 
@@ -109,6 +109,12 @@ def __call__(
                 The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                 (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                 `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
 
         Returns:
             [`BatchFeature`]: A [`BatchFeature`] with the following fields:
@@ -120,10 +126,11 @@ def __call__(
             - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
         """
         if images is None and text is None:
-            raise ValueError("You have to specify at least images or text.")
+            raise ValueError("You have to specify at least one of `images` or `text`.")
 
         # check if images and text inputs are reversed for BC
-        images, text = _check_reversed_images_text_for_vlms(images, text)
+        # no manual swap here: the helper is expected to return the pair already in (images, text) order
+        images, text = _validate_images_text_input_order(images, text)
 
         output_kwargs = self._merge_kwargs(
             LlavaProcessorKwargs,

diff --git a/tests/models/llava/test_processor_llava.py b/tests/models/llava/test_processor_llava.py
@@ -15,14 +15,15 @@
 import tempfile
 import unittest
 
+from transformers import AutoProcessor, AutoTokenizer, LlamaTokenizerFast, LlavaProcessor
 from transformers.testing_utils import require_torch, require_vision
 from transformers.utils import is_vision_available
 
 from ...test_processing_common import ProcessorTesterMixin
 
 
 if is_vision_available():
-    from transformers import AutoProcessor, AutoTokenizer, CLIPImageProcessor, LlamaTokenizerFast, LlavaProcessor
+    from transformers import CLIPImageProcessor
 
 
 @require_vision
@@ -34,7 +35,7 @@ def setUp(self):
         image_processor = CLIPImageProcessor(do_center_crop=False)
         tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b")
 
-        processor = LlavaProcessor(image_processor, tokenizer)
+        processor = LlavaProcessor(image_processor=image_processor, tokenizer=tokenizer)
 
         processor.save_pretrained(self.tmpdirname)