Skip to content

Commit

Permalink
change size and crop_size in processor kwargs tests to do_rescale and…
Browse files Browse the repository at this point in the history
… rescale_factor
  • Loading branch information
yonigozlan committed Sep 13, 2024
1 parent 86211d3 commit df6ddaf
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 360 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,11 @@ class LlavaOnevisionProcessor(ProcessorMixin):
r"""
Constructs a LLaVa-Onevision processor which wraps a LLaVa-Onevision video processor, LLaVa-NeXT image processor and a LLaMa tokenizer into a single processor.
[`LlavaNextProcessor`] offers all the functionalities of [`LlavaOnevisionVideoProcessor`], [`LlavaNextImageProcessor`] and [`LlamaTokenizerFast`]. See the
[`LlavaNextProcessor`] offers all the functionalities of [`LlavaOnevisionVideoProcessor`], [`LlavaOnevisionImageProcessor`] and [`LlamaTokenizerFast`]. See the
[`~LlavaOnevisionVideoProcessor.__call__`], [`~LlavaNextProcessor.__call__`] and [`~LlavaNextProcessor.decode`] for more information.
Args:
image_processor ([`LlavaNextImageProcessor`], *optional*):
image_processor ([`LlavaOnevisionImageProcessor`], *optional*):
The image processor is a required input.
tokenizer ([`LlamaTokenizerFast`], *optional*):
The tokenizer is a required input.
Expand All @@ -82,7 +82,7 @@ class LlavaOnevisionProcessor(ProcessorMixin):
"image_token",
"video_token",
]
image_processor_class = "AutoImageProcessor"
image_processor_class = "LlavaOnevisionImageProcessor"
tokenizer_class = "AutoTokenizer"
video_processor_class = "LlavaOnevisionVideoProcessor"

Expand All @@ -96,7 +96,6 @@ def __init__(
chat_template=None,
image_token="<image>",
video_token="<video>",
**kwargs: Unpack[LlavaOnevisionProcessorKwargs],
):
self.num_image_tokens = num_image_tokens
self.vision_feature_select_strategy = vision_feature_select_strategy
Expand All @@ -108,8 +107,9 @@ def __call__(
self,
images: ImageInput = None,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
audio=None,
videos: VideoInput = None,
**kwargs,
**kwargs: Unpack[LlavaOnevisionProcessorKwargs],
) -> BatchFeature:
"""
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
Expand Down
203 changes: 1 addition & 202 deletions tests/models/llava_onevision/test_processing_llava_onevision.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import tempfile
import unittest

from transformers.testing_utils import require_torch, require_vision
from transformers.testing_utils import require_vision
from transformers.utils import is_vision_available

from ...test_processing_common import ProcessorTesterMixin
Expand Down Expand Up @@ -74,204 +74,3 @@ def test_chat_template(self):

formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
self.assertEqual(expected_prompt, formatted_prompt)

@require_torch
@require_vision
def test_image_processor_defaults_preserved_by_image_kwargs(self):
    # Overridden: the llava-next image processor returns pixel values with an
    # added dimension for image patches, so the default `size` is checked on
    # the innermost axis instead.
    if "image_processor" not in self.processor_class.attributes:
        self.skipTest(f"image_processor attribute not present in {self.processor_class}")
    image_proc = self.get_component("image_processor", size=(234, 234))
    video_proc = self.get_component("video_processor", size=(234, 234))
    tok = self.get_component("tokenizer", max_length=117)

    processor = self.processor_class(
        tokenizer=tok, image_processor=image_proc, video_processor=video_proc
    )
    self.skip_processor_without_typed_kwargs(processor)

    prompt = "lower newer"
    images = self.prepare_image_inputs()

    outputs = processor(text=prompt, images=images)
    # index through the added patch dimension to reach the spatial axis
    self.assertEqual(len(outputs["pixel_values"][0][0][0]), 234)

@require_torch
@require_vision
def test_kwargs_overrides_default_image_processor_kwargs(self):
    if "image_processor" not in self.processor_class.attributes:
        self.skipTest(f"image_processor attribute not present in {self.processor_class}")
    image_proc = self.get_component("image_processor", crop_size=(234, 234))
    video_proc = self.get_component("video_processor", size=(234, 234))
    tok = self.get_component("tokenizer", max_length=117)

    processor = self.processor_class(
        tokenizer=tok, image_processor=image_proc, video_processor=video_proc
    )
    self.skip_processor_without_typed_kwargs(processor)

    prompt = "lower newer"
    images = self.prepare_image_inputs()

    # a call-time `size` must win over the value configured at init;
    # pixel_values carries an added dimension for image patches
    outputs = processor(text=prompt, images=images, size=[224, 224])
    self.assertEqual(len(outputs["pixel_values"][0][0][0]), 224)

@require_torch
@require_vision
def test_unstructured_kwargs(self):
    # Flat (unstructured) kwargs should be routed to the right sub-processor.
    processor = self.processor_class(
        image_processor=self.get_component("image_processor"),
        video_processor=self.get_component("video_processor"),
        tokenizer=self.get_component("tokenizer"),
    )
    self.skip_processor_without_typed_kwargs(processor)

    outputs = processor(
        text="lower newer",
        images=self.prepare_image_inputs(),
        return_tensors="pt",
        size={"height": 214, "width": 214},
        padding="max_length",
        max_length=76,
    )

    # the image size lands after the added patch dimension
    self.assertEqual(outputs["pixel_values"].shape[3], 214)
    self.assertEqual(len(outputs["input_ids"][0]), 76)

@require_torch
@require_vision
def test_unstructured_kwargs_batched(self):
    # Same as test_unstructured_kwargs but with a batch of two samples.
    processor = self.processor_class(
        image_processor=self.get_component("image_processor"),
        video_processor=self.get_component("video_processor"),
        tokenizer=self.get_component("tokenizer"),
    )
    self.skip_processor_without_typed_kwargs(processor)

    prompts = ["lower newer", "upper older longer string"]
    images = self.prepare_image_inputs() * 2
    outputs = processor(
        text=prompts,
        images=images,
        return_tensors="pt",
        size={"height": 214, "width": 214},
        padding="longest",
        max_length=76,
    )
    self.assertEqual(outputs["pixel_values"].shape[3], 214)
    # "longest" padding: both rows are padded to the longer sequence
    self.assertEqual(len(outputs["input_ids"][0]), 5)

@require_torch
@require_vision
def test_structured_kwargs_nested(self):
    # kwargs grouped per modality should be unpacked and applied correctly.
    processor = self.processor_class(
        image_processor=self.get_component("image_processor"),
        video_processor=self.get_component("video_processor"),
        tokenizer=self.get_component("tokenizer"),
    )
    self.skip_processor_without_typed_kwargs(processor)

    prompt = "lower newer"
    images = self.prepare_image_inputs()

    # one sub-dict per modality, plus kwargs shared by all of them
    call_kwargs = {
        "common_kwargs": {"return_tensors": "pt"},
        "images_kwargs": {"size": {"height": 214, "width": 214}},
        "text_kwargs": {"padding": "max_length", "max_length": 76},
    }

    outputs = processor(text=prompt, images=images, **call_kwargs)
    self.skip_processor_without_typed_kwargs(processor)

    self.assertEqual(outputs["pixel_values"].shape[3], 214)
    self.assertEqual(len(outputs["input_ids"][0]), 76)

@require_torch
@require_vision
def test_structured_kwargs_nested_from_dict(self):
    # Like test_structured_kwargs_nested, with the grouped kwargs built as a
    # plain dict and splatted into the call.
    processor = self.processor_class(
        image_processor=self.get_component("image_processor"),
        video_processor=self.get_component("video_processor"),
        tokenizer=self.get_component("tokenizer"),
    )
    self.skip_processor_without_typed_kwargs(processor)
    prompt = "lower newer"
    images = self.prepare_image_inputs()

    # one sub-dict per modality, plus kwargs shared by all of them
    call_kwargs = {
        "common_kwargs": {"return_tensors": "pt"},
        "images_kwargs": {"size": {"height": 214, "width": 214}},
        "text_kwargs": {"padding": "max_length", "max_length": 76},
    }

    outputs = processor(text=prompt, images=images, **call_kwargs)
    self.assertEqual(outputs["pixel_values"].shape[3], 214)
    self.assertEqual(len(outputs["input_ids"][0]), 76)

@require_torch
@require_vision
def test_doubly_passed_kwargs(self):
    # Passing `size` both flat and inside `images_kwargs` is ambiguous and
    # must be rejected.
    processor = self.processor_class(
        image_processor=self.get_component("image_processor"),
        video_processor=self.get_component("video_processor"),
        tokenizer=self.get_component("tokenizer"),
    )
    self.skip_processor_without_typed_kwargs(processor)

    prompts = ["lower newer"]
    images = self.prepare_image_inputs()
    with self.assertRaises(ValueError):
        _ = processor(
            text=prompts,
            images=images,
            images_kwargs={"size": {"height": 222, "width": 222}},
            size={"height": 214, "width": 214},
        )

@require_vision
@require_torch
def test_kwargs_overrides_default_tokenizer_kwargs(self):
    processor = self.processor_class(
        image_processor=self.get_component("image_processor"),
        video_processor=self.get_component("video_processor"),
        tokenizer=self.get_component("tokenizer", max_length=117),
    )
    self.skip_processor_without_typed_kwargs(processor)
    prompt = "lower newer"
    images = self.prepare_image_inputs()

    # max_length given at call time (112) overrides the init default (117)
    outputs = processor(text=prompt, images=images, return_tensors="pt", max_length=112)
    self.assertEqual(len(outputs["input_ids"][0]), 112)

@require_vision
@require_torch
def test_tokenizer_defaults_preserved_by_kwargs(self):
    processor = self.processor_class(
        image_processor=self.get_component("image_processor"),
        video_processor=self.get_component("video_processor"),
        tokenizer=self.get_component("tokenizer", max_length=117),
    )
    self.skip_processor_without_typed_kwargs(processor)
    prompt = "lower newer"
    images = self.prepare_image_inputs()

    # with no call-time override, the tokenizer's init-time max_length applies
    outputs = processor(text=prompt, images=images, return_tensors="pt")
    self.assertEqual(len(outputs["input_ids"][0]), 117)
127 changes: 0 additions & 127 deletions tests/models/qwen2_vl/test_processing_qwen2_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,130 +108,3 @@ def test_model_input_names(self):
inputs = processor(text=input_str, images=image_input, videos=video_inputs)

self.assertListEqual(list(inputs.keys()), processor.model_input_names)

# Qwen2-VL doesn't accept `size` and resizes to an optimal size using image_processor attributes
# defined at `init`. Therefore, all tests are overwritten and don't actually test whether kwargs
# are passed to image processors.
def test_image_processor_defaults_preserved_by_image_kwargs(self):
    processor = self.processor_class(
        image_processor=self.get_component("image_processor"),
        tokenizer=self.get_component("tokenizer", max_length=117, padding="max_length"),
    )
    self.skip_processor_without_typed_kwargs(processor)

    outputs = processor(text="lower newer", images=self.prepare_image_inputs())
    # default image-processor config yields a first pixel_values dimension of 800
    self.assertEqual(outputs["pixel_values"].shape[0], 800)

def test_kwargs_overrides_default_image_processor_kwargs(self):
    # No size kwargs are passed here (Qwen2-VL ignores `size`); the call is
    # only checked against the default configuration.
    processor = self.processor_class(
        image_processor=self.get_component("image_processor"),
        tokenizer=self.get_component("tokenizer", max_length=117, padding="max_length"),
    )
    self.skip_processor_without_typed_kwargs(processor)

    outputs = processor(text="lower newer", images=self.prepare_image_inputs())
    self.assertEqual(outputs["pixel_values"].shape[0], 800)

def test_unstructured_kwargs(self):
    processor = self.processor_class(
        image_processor=self.get_component("image_processor"),
        tokenizer=self.get_component("tokenizer"),
    )
    self.skip_processor_without_typed_kwargs(processor)

    outputs = processor(
        text="lower newer",
        images=self.prepare_image_inputs(),
        return_tensors="pt",
        padding="max_length",
        max_length=76,
    )

    # image output keeps its default first dimension; text is padded to max_length
    self.assertEqual(outputs["pixel_values"].shape[0], 800)
    self.assertEqual(len(outputs["input_ids"][0]), 76)

def test_unstructured_kwargs_batched(self):
    if "image_processor" not in self.processor_class.attributes:
        self.skipTest(f"image_processor attribute not present in {self.processor_class}")
    processor = self.processor_class(
        image_processor=self.get_component("image_processor"),
        tokenizer=self.get_component("tokenizer"),
    )
    self.skip_processor_without_typed_kwargs(processor)

    prompts = ["lower newer", "upper older longer string"]
    images = self.prepare_image_inputs() * 2
    outputs = processor(
        text=prompts,
        images=images,
        return_tensors="pt",
        padding="longest",
        max_length=76,
    )

    # two images double the pixel_values rows; "longest" pads text to 4 tokens
    self.assertEqual(outputs["pixel_values"].shape[0], 1600)
    self.assertEqual(len(outputs["input_ids"][0]), 4)

def test_structured_kwargs_nested(self):
    processor = self.processor_class(
        image_processor=self.get_component("image_processor"),
        tokenizer=self.get_component("tokenizer"),
    )
    self.skip_processor_without_typed_kwargs(processor)

    prompt = "lower newer"
    images = self.prepare_image_inputs()

    # kwargs grouped per modality (no images_kwargs: Qwen2-VL ignores `size`)
    call_kwargs = {
        "common_kwargs": {"return_tensors": "pt"},
        "text_kwargs": {"padding": "max_length", "max_length": 76},
    }

    outputs = processor(text=prompt, images=images, **call_kwargs)
    self.skip_processor_without_typed_kwargs(processor)

    self.assertEqual(outputs["pixel_values"].shape[0], 800)
    self.assertEqual(len(outputs["input_ids"][0]), 76)

def test_structured_kwargs_nested_from_dict(self):
    processor = self.processor_class(
        image_processor=self.get_component("image_processor"),
        tokenizer=self.get_component("tokenizer"),
    )
    self.skip_processor_without_typed_kwargs(processor)
    prompt = "lower newer"
    images = self.prepare_image_inputs()

    # kwargs grouped per modality, built as a plain dict and splatted in
    call_kwargs = {
        "common_kwargs": {"return_tensors": "pt"},
        "text_kwargs": {"padding": "max_length", "max_length": 76},
    }

    outputs = processor(text=prompt, images=images, **call_kwargs)
    self.assertEqual(outputs["pixel_values"].shape[0], 800)
    self.assertEqual(len(outputs["input_ids"][0]), 76)

def test_image_processor_defaults_preserved_by_video_kwargs(self):
    processor = self.processor_class(
        image_processor=self.get_component("image_processor"),
        tokenizer=self.get_component("tokenizer", max_length=117, padding="max_length"),
    )
    self.skip_processor_without_typed_kwargs(processor)

    # video path: default config yields a first pixel_values_videos dimension of 9600
    outputs = processor(text="lower newer", videos=self.prepare_video_inputs())
    self.assertEqual(outputs["pixel_values_videos"].shape[0], 9600)
Loading

0 comments on commit df6ddaf

Please sign in to comment.