diff --git a/src/transformers/models/llava/processing_llava.py b/src/transformers/models/llava/processing_llava.py
index 59ad219fca059c..4600d1063b2f52 100644
--- a/src/transformers/models/llava/processing_llava.py
+++ b/src/transformers/models/llava/processing_llava.py
@@ -21,7 +21,7 @@
 from ...feature_extraction_utils import BatchFeature
 from ...image_utils import ImageInput, get_image_size, to_numpy_array
-from ...processing_utils import ProcessingKwargs, ProcessorMixin, _check_reversed_images_text_for_vlms
+from ...processing_utils import ProcessingKwargs, ProcessorMixin, _validate_images_text_input_order
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
 from ...utils import logging
 
 
@@ -109,6 +109,12 @@ def __call__(
                 The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                 (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                 `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                    - `'tf'`: Return TensorFlow `tf.constant` objects.
+                    - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                    - `'np'`: Return NumPy `np.ndarray` objects.
+                    - `'jax'`: Return JAX `jnp.ndarray` objects.
 
         Returns:
             [`BatchFeature`]: A [`BatchFeature`] with the following fields:
@@ -120,10 +126,10 @@ def __call__(
             - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
         """
         if images is None and text is None:
-            raise ValueError("You have to specify at least images or text.")
+            raise ValueError("You have to specify at least one of `images` or `text`.")
 
         # check if images and text inputs are reversed for BC
-        images, text = _check_reversed_images_text_for_vlms(images, text)
+        images, text = _validate_images_text_input_order(images, text)
 
         output_kwargs = self._merge_kwargs(
             LlavaProcessorKwargs,
diff --git a/tests/models/llava/test_processor_llava.py b/tests/models/llava/test_processor_llava.py
index ab9904a6c59f27..5b05a8b92ea513 100644
--- a/tests/models/llava/test_processor_llava.py
+++ b/tests/models/llava/test_processor_llava.py
@@ -15,6 +15,7 @@
 import tempfile
 import unittest
 
+from transformers import AutoProcessor, AutoTokenizer, LlamaTokenizerFast, LlavaProcessor
 from transformers.testing_utils import require_torch, require_vision
 from transformers.utils import is_vision_available
 
@@ -22,7 +23,7 @@
 
 
 if is_vision_available():
-    from transformers import AutoProcessor, AutoTokenizer, CLIPImageProcessor, LlamaTokenizerFast, LlavaProcessor
+    from transformers import CLIPImageProcessor
 
 
 @require_vision
@@ -34,7 +35,7 @@ def setUp(self):
         image_processor = CLIPImageProcessor(do_center_crop=False)
         tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b")
 
-        processor = LlavaProcessor(image_processor, tokenizer)
+        processor = LlavaProcessor(image_processor=image_processor, tokenizer=tokenizer)
 
         processor.save_pretrained(self.tmpdirname)
 
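
For reviewers, a minimal usage sketch of the behavior this patch touches: `__call__` keeps accepting the legacy reversed positional order and swaps the arguments back via `_validate_images_text_input_order`, logging a warning instead of failing. The checkpoint id and image URL below are illustrative, not part of this diff.

```python
import requests
from PIL import Image

from transformers import LlavaProcessor

# Illustrative checkpoint; any Llava checkpoint with a processor config works.
processor = LlavaProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
prompt = "USER: <image>\nWhat is shown in this image? ASSISTANT:"

# Canonical order: images first, then text.
inputs = processor(images=image, text=prompt, return_tensors="pt")

# Legacy reversed positional order: the validation helper detects that the
# first argument is text-like and the second is image-like, swaps them back
# for backward compatibility, and warns once.
inputs = processor(prompt, image, return_tensors="pt")

print(inputs.keys())  # input_ids, attention_mask, pixel_values
```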
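The helper's implementation is not part of this diff; as a rough sketch of the kind of check it performs (the `_looks_like_text` / `_looks_like_image` names are hypothetical, the real code lives in `src/transformers/processing_utils.py` and may differ in detail):

```python
import numpy as np
from PIL import Image


def _looks_like_text(x) -> bool:
    # Strings, or (nested) lists/tuples of strings, count as text input.
    if isinstance(x, str):
        return True
    if isinstance(x, (list, tuple)) and len(x) > 0:
        return all(_looks_like_text(item) for item in x)
    return False


def _looks_like_image(x) -> bool:
    # PIL images, arrays, or (nested) lists/tuples of them count as image input.
    if isinstance(x, (Image.Image, np.ndarray)):
        return True
    if isinstance(x, (list, tuple)) and len(x) > 0:
        return all(_looks_like_image(item) for item in x)
    return False


def _validate_images_text_input_order(images, text):
    # Only swap when both arguments are unambiguously in the wrong slot;
    # otherwise return the inputs unchanged.
    if _looks_like_text(images) and _looks_like_image(text):
        return text, images
    return images, text
```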