diff --git a/docs/source/en/model_doc/idefics3.md b/docs/source/en/model_doc/idefics3.md
index 9391c9020ee502..17d2a965df9d96 100644
--- a/docs/source/en/model_doc/idefics3.md
+++ b/docs/source/en/model_doc/idefics3.md
@@ -40,6 +40,22 @@ This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts)
 
 The original code can be found [here]().
 
+## Idefics3Config
+
+[[autodoc]] Idefics3Config
+
+
+## Idefics3Model
+
+[[autodoc]] Idefics3Model
+    - forward
+
+## Idefics3ForConditionalGeneration
+
+[[autodoc]] Idefics3ForConditionalGeneration
+    - forward
+
+
 ## Idefics3ImageProcessor
 [[autodoc]] Idefics3ImageProcessor
     - preprocess
diff --git a/src/transformers/models/idefics3/configuration_idefics3.py b/src/transformers/models/idefics3/configuration_idefics3.py
index 4b03a3af1576fb..75e7c1da261023 100644
--- a/src/transformers/models/idefics3/configuration_idefics3.py
+++ b/src/transformers/models/idefics3/configuration_idefics3.py
@@ -152,8 +152,6 @@ class Idefics3Config(PretrainedConfig):
             The scale factor for the image encoder.
         pad_token_id (`int`, *optional*, defaults to 128002):
             The id of the padding token.
-        max_position_embeddings (`int`, *optional*, defaults to 131072):
-            The maximum length of the input sequence.
 
     Example:
     ```python
@@ -178,7 +176,6 @@ def __init__(
         text_config=None,
         scale_factor=2,
         pad_token_id=128_002,
-        max_position_embeddings=131_072,
         **kwargs,
     ):
         self.image_token_id = image_token_id
@@ -199,7 +196,6 @@ def __init__(
         elif text_config is None:
             logger.info("text_config is None, using default text config")
             text_config = CONFIG_MAPPING["llama"](
-                max_position_embeddings=max_position_embeddings,
                 rms_norm_eps=1e-5,
                 pad_token_id=pad_token_id,
                 tie_word_embeddings=False,
diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py
index ed6d2574b3912b..6681aa491405cc 100644
--- a/src/transformers/models/idefics3/image_processing_idefics3.py
+++ b/src/transformers/models/idefics3/image_processing_idefics3.py
@@ -385,8 +385,6 @@ class Idefics3ImageProcessor(BaseImageProcessor):
         do_pad (`bool`, *optional*, defaults to `True`):
             Whether or not to pad the images to the largest height and width in the batch and number of images per
             sample in the batch, such that the returned tensor is of shape (batch_size, max_num_images, num_channels, max_height, max_width).
-        vision_encoder_max_size (`int`, *optional*, defaults to `364`):
-            Maximum size of the images accepted by the vision encoder. The images are split into patches of this size.
     """
 
     model_input_names = ["pixel_values"]
diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py
index 2472a4543601ae..6201bf3850fea6 100644
--- a/src/transformers/models/idefics3/processing_idefics3.py
+++ b/src/transformers/models/idefics3/processing_idefics3.py
@@ -16,9 +16,9 @@
 Processor class for Idefics3.
""" -import sys -from typing import TYPE_CHECKING, List, Optional, Union import re +import sys +from typing import TYPE_CHECKING, Dict, List, Optional, Union from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, is_valid_image, load_image @@ -93,7 +93,7 @@ def get_image_prompt_string( class Idefics3ImagesKwargs(ImagesKwargs, total=False): image_seq_len: Optional[int] return_row_col_info: Optional[bool] - max_image_size: Optional[dict[str, int]] + max_image_size: Optional[Dict[str, int]] class Idefics3ProcessorKwargs(ProcessingKwargs, total=False): @@ -111,6 +111,9 @@ class Idefics3ProcessorKwargs(ProcessingKwargs, total=False): } +Idefics3ProcessorKwargs.__annotations__["images_kwargs"] = Idefics3ImagesKwargs # python 3.8 compatibility + + class Idefics3Processor(ProcessorMixin): r""" Constructs a Idefics3 processor which wraps a LLama tokenizer and Idefics3 image processor into a single processor. @@ -147,7 +150,7 @@ def __init__(self, image_processor, tokenizer=None, image_seq_len: int = 169, ch self.global_img_token = "" self.image_seq_len = image_seq_len - self._regex_to_remove_extra_special_tokens = re.compile(r'(\n?\n?|\n?)+') + self._regex_to_remove_extra_special_tokens = re.compile(r"(\n?\n?|\n?)+") tokens_to_add = { "additional_special_tokens": [ @@ -158,7 +161,7 @@ def __init__(self, image_processor, tokenizer=None, image_seq_len: int = 169, ch } tokenizer.add_special_tokens(tokens_to_add) - super().__init__(image_processor, tokenizer, chat_template=chat_template) + super().__init__(image_processor, tokenizer, chat_template=chat_template, **kwargs) def _extract_images_from_prompts(self, prompts): prompt_images = [] @@ -356,7 +359,6 @@ def decode(self, *args, **kwargs): decode_output = self.tokenizer.decode(*args, **kwargs) return self._regex_to_remove_extra_special_tokens.sub("", decode_output) - @property def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 09f62481956e77..1ddf324c5c4b4e 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -291,6 +291,17 @@ class ModelProcessorKwargs(ProcessingKwargs, total=False): } ``` + + For Python 3.8 compatibility, when inheriting from this class and overriding one of the kwargs, + you need to manually update the __annotations__ dictionary. 
+
+    ```python
+    class CustomProcessorKwargs(ProcessingKwargs, total=False):
+        images_kwargs: CustomImagesKwargs
+
+    CustomProcessorKwargs.__annotations__["images_kwargs"] = CustomImagesKwargs  # python 3.8 compatibility
+    ```
     """
 
     common_kwargs: CommonKwargs = {
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index 7c8e6dc87c0ef6..2941a20239ef7e 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -4817,35 +4817,42 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
-class Idefics3ForConditionalGeneration(metaclass=DummyObject):
+class Idefics2Model(metaclass=DummyObject):
     _backends = ["torch"]
 
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
-class Idefics2Model(metaclass=DummyObject):
+class Idefics2PreTrainedModel(metaclass=DummyObject):
     _backends = ["torch"]
 
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
-class Idefics2PreTrainedModel(metaclass=DummyObject):
+class Idefics2Processor(metaclass=DummyObject):
     _backends = ["torch"]
 
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
-class Idefics3PreTrainedModel(metaclass=DummyObject):
+class Idefics3ForConditionalGeneration(metaclass=DummyObject):
     _backends = ["torch"]
 
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
-class Idefics2Processor(metaclass=DummyObject):
+class Idefics3Model(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class Idefics3PreTrainedModel(metaclass=DummyObject):
     _backends = ["torch"]
 
     def __init__(self, *args, **kwargs):
diff --git a/tests/models/idefics3/test_modeling_idefics3.py b/tests/models/idefics3/test_modeling_idefics3.py
index 43c01ba2f0aa30..d8945db7fa7ce6 100644
--- a/tests/models/idefics3/test_modeling_idefics3.py
+++ b/tests/models/idefics3/test_modeling_idefics3.py
@@ -23,13 +23,10 @@
 
 from transformers import (
     AutoProcessor,
-    Idefics3Config,
-    Idefics3ForConditionalGeneration,
-    Idefics3Model,
     is_torch_available,
     is_vision_available,
 )
-from transformers.testing_utils import require_bitsandbytes, require_torch, slow, torch_device
+from transformers.testing_utils import require_bitsandbytes, require_torch, require_torch_multi_gpu, slow, torch_device
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
@@ -38,6 +35,12 @@
 
 if is_torch_available():
     import torch
+
+    from transformers import (
+        Idefics3Config,
+        Idefics3ForConditionalGeneration,
+        Idefics3Model,
+    )
 else:
     is_torch_greater_or_equal_than_2_0 = False
 
@@ -483,13 +486,13 @@ def tearDown(self):
         torch.cuda.empty_cache()
 
     @slow
+    @require_torch_multi_gpu
     def test_integration_test(self):
         model = Idefics3ForConditionalGeneration.from_pretrained(
             "HuggingFaceM4/Idefics3-8B-Llama3",
             torch_dtype=torch.bfloat16,
             device_map="auto",
         )
-        model.to(torch_device)
 
         # Create inputs
         text = "<image>In this image, we see"
@@ -500,16 +503,18 @@
         generated_ids = model.generate(**inputs, max_new_tokens=10)
         generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
 
-        # Batch affects generated text. Single batch output: ['In this image, we see the Statue of Liberty in the foreground and']
-        expected_generated_text = "In this image, we see the Statue of Liberty, the New York City"
+        expected_generated_text = "In this image, we see the Statue of Liberty, which is located on Liberty"
         self.assertEqual(generated_texts[0], expected_generated_text)
 
     @slow
     @require_bitsandbytes
+    @require_torch_multi_gpu
     def test_integration_test_4bit(self):
         # Let' s make sure we test the preprocessing to replace what is used
         model = Idefics3ForConditionalGeneration.from_pretrained(
-            "HuggingFaceM4/Idefics3-8B-Llama3", load_in_4bit=True, device_map="auto"
+            "HuggingFaceM4/Idefics3-8B-Llama3",
+            load_in_4bit=True,
+            device_map="auto",
         )
 
         # Create pixel inputs
@@ -520,5 +525,5 @@
         generated_ids = model.generate(**inputs, max_new_tokens=10)
         generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
 
-        expected_generated_text = "In this image, we see the Statue of Liberty, the Hudson River,"
+        expected_generated_text = "In this image, we see the Statue of Liberty, trees, buildings, water"
         self.assertEqual(generated_texts[0], expected_generated_text)
diff --git a/tests/models/idefics3/test_processing_idefics3.py b/tests/models/idefics3/test_processing_idefics3.py
index d50d2d4eb91ef8..f4396b7c13d540 100644
--- a/tests/models/idefics3/test_processing_idefics3.py
+++ b/tests/models/idefics3/test_processing_idefics3.py
@@ -37,39 +37,39 @@ class Idefics3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
     processor_class = Idefics3Processor
 
-    def setUp(self):
-        self.tmpdirname = tempfile.mkdtemp()
+    @classmethod
+    def setUpClass(cls):
+        cls.tmpdirname = tempfile.mkdtemp()
 
         processor = Idefics3Processor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3", image_seq_len=2)
-        processor.save_pretrained(self.tmpdirname)
-        self.max_image_size = 364
-        self.image1 = Image.open(
+        processor.save_pretrained(cls.tmpdirname)
+        cls.image1 = Image.open(
             BytesIO(
                 requests.get(
                     "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
                 ).content
             )
         )
-        self.image2 = Image.open(
+        cls.image2 = Image.open(
             BytesIO(requests.get("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg").content)
         )
-        self.image3 = Image.open(
+        cls.image3 = Image.open(
             BytesIO(
                 requests.get(
                     "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"
                 ).content
             )
         )
-        self.bos_token = processor.tokenizer.bos_token
-        self.image_token = processor.image_token.content
-        self.fake_image_token = processor.fake_image_token.content
-        self.global_img_token = processor.global_img_token
-
-        self.bos_token_id = processor.tokenizer.convert_tokens_to_ids(self.bos_token)
-        self.image_token_id = processor.tokenizer.convert_tokens_to_ids(self.image_token)
-        self.fake_image_token_id = processor.tokenizer.convert_tokens_to_ids(self.fake_image_token)
-        self.global_img_token_id = processor.global_img_token_id
-        self.padding_token_id = processor.tokenizer.pad_token_id
-        self.image_seq_len = processor.image_seq_len
+        cls.bos_token = processor.tokenizer.bos_token
+        cls.image_token = processor.image_token.content
+        cls.fake_image_token = processor.fake_image_token.content
+        cls.global_img_token = processor.global_img_token
+
+        cls.bos_token_id = processor.tokenizer.convert_tokens_to_ids(cls.bos_token)
+        cls.image_token_id = processor.tokenizer.convert_tokens_to_ids(cls.image_token)
+        cls.fake_image_token_id = processor.tokenizer.convert_tokens_to_ids(cls.fake_image_token)
+        cls.global_img_tokens_id = processor.tokenizer(cls.global_img_token, add_special_tokens=False)["input_ids"]
+        cls.padding_token_id = processor.tokenizer.pad_token_id
+        cls.image_seq_len = processor.image_seq_len
 
     def get_tokenizer(self, **kwargs):
         return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
@@ -96,14 +96,15 @@ def get_splitted_image_expected_tokens(self, processor, image_rows, image_cols):
         ]  # add double newline, as it gets its own token
         text_split_images += (
             [self.fake_image_token_id]
-            + [self.global_img_token_id]
+            + self.global_img_tokens_id
            + [self.image_token_id] * self.image_seq_len
             + [self.fake_image_token_id]
         )
         return text_split_images
 
-    def tearDown(self):
-        shutil.rmtree(self.tmpdirname)
+    @classmethod
+    def tearDownClass(cls):
+        shutil.rmtree(cls.tmpdirname)
 
     def test_process_interleaved_images_prompts_no_image_splitting(self):
         processor = self.get_processor()
@@ -124,7 +125,7 @@ def test_process_interleaved_images_prompts_no_image_splitting(self):
 
         # fmt: off
         tokenized_sentence = processor.tokenizer(text_str, add_special_tokens=False)
-        expected_input_ids = [[self.bos_token_id] + [self.fake_image_token_id] + [self.global_img_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + tokenized_sentence["input_ids"]]
+        expected_input_ids = [[self.bos_token_id] + [self.fake_image_token_id] + self.global_img_tokens_id + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + tokenized_sentence["input_ids"]]
         self.assertEqual(inputs["input_ids"], expected_input_ids)
         self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])])
         self.assertEqual(inputs["pixel_values"].shape, (1, 1, 3, 1092, 1456))
@@ -147,7 +148,7 @@ def test_process_interleaved_images_prompts_no_image_splitting(self):
         # fmt: off
         tokenized_sentence_1 = processor.tokenizer(text_str_1, add_special_tokens=False)
         tokenized_sentence_2 = processor.tokenizer(text_str_2, add_special_tokens=False)
-        image_tokens = [self.fake_image_token_id] + [self.global_img_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id]
+        image_tokens = [self.fake_image_token_id] + self.global_img_tokens_id + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id]
         expected_input_ids_1 = [self.bos_token_id] + image_tokens + tokenized_sentence_1["input_ids"]
         expected_input_ids_2 = [self.bos_token_id] + 2 * image_tokens + tokenized_sentence_2["input_ids"]
         # Pad the first input to match the second input
@@ -308,8 +309,10 @@ def test_image_processor_defaults_preserved_by_image_kwargs(self):
     def test_kwargs_overrides_default_image_processor_kwargs(self):
         if "image_processor" not in self.processor_class.attributes:
             self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor", max_image_size={"longest_edge": 80})
-        tokenizer = self.get_component("tokenizer", max_length=117)
+        image_processor = self.get_component(
+            "image_processor", max_image_size={"longest_edge": 32}, size={"longest_edge": 32}
+        )
+        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
 
         processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
         self.skip_processor_without_typed_kwargs(processor)
@@ -319,7 +322,8 @@
         inputs = processor(text=input_str, images=image_input)
         self.assertEqual(len(inputs["pixel_values"][0][0]), 3)
-        self.assertEqual(len(inputs["pixel_values"][0][0][0]), 80)
+        self.assertEqual(len(inputs["pixel_values"][0][0][0]), 32)
+        self.assertEqual(len(inputs["input_ids"][0]), 117)
 
     # We need to overwrite this test to adapt it to our processor.
     @require_vision
@@ -354,16 +358,16 @@ def test_structured_kwargs_nested(self):
         image_input = self.prepare_image_inputs()
 
         # Define the kwargs for each modality
-        all_kwargs = {
-            "common_kwargs": {"return_tensors": "pt"},
-            "images_kwargs": {"max_image_size": {"longest_edge": 214}},
-            "text_kwargs": {"padding": "max_length", "max_length": 120},
-        }
-
-        inputs = processor(text=input_str, images=image_input, **all_kwargs)
+        inputs = processor(
+            text=input_str,
+            images=image_input,
+            common_kwargs={"return_tensors": "pt"},
+            images_kwargs={"max_image_size": {"longest_edge": 32}},
+            text_kwargs={"padding": "max_length", "max_length": 120, "truncation": "longest_first"},
+        )
         self.skip_processor_without_typed_kwargs(processor)
 
-        self.assertEqual(inputs["pixel_values"].shape[3], 214)
+        self.assertEqual(inputs["pixel_values"].shape[3], 32)
         self.assertEqual(len(inputs["input_ids"][0]), 120)
 
@@ -385,12 +389,12 @@ def test_structured_kwargs_nested_from_dict(self):
 
         # Define the kwargs for each modality
         all_kwargs = {
             "common_kwargs": {"return_tensors": "pt"},
-            "images_kwargs": {"max_image_size": {"longest_edge": 214}},
-            "text_kwargs": {"padding": "max_length", "max_length": 120},
+            "images_kwargs": {"max_image_size": {"longest_edge": 32}},
+            "text_kwargs": {"padding": "max_length", "max_length": 120, "truncation": "longest_first"},
         }
 
         inputs = processor(text=input_str, images=image_input, **all_kwargs)
-        self.assertEqual(inputs["pixel_values"].shape[3], 214)
+        self.assertEqual(inputs["pixel_values"].shape[3], 32)
         self.assertEqual(len(inputs["input_ids"][0]), 120)
 
     # We need to overwrite this test to adapt it to our processor.
     @require_torch
@@ -424,19 +428,19 @@ def test_unstructured_kwargs_batched(self):
         input_str = ["lower newer", "upper older longer string"]
         image_input = self.prepare_image_inputs()
-        print(image_input)
         inputs = processor(
             text=input_str,
             images=[image_input, image_input],
             return_tensors="pt",
             padding="longest",
             max_length=76,
-            max_image_size={"longest_edge": 214},
+            truncation=True,
+            max_image_size={"longest_edge": 30},
         )
 
         self.assertEqual(inputs["pixel_values"].shape[2], 3)
-        self.assertEqual(inputs["pixel_values"].shape[3], 214)
-        self.assertEqual(len(inputs["input_ids"][0]), 88)
+        self.assertEqual(inputs["pixel_values"].shape[3], 30)
+        self.assertEqual(len(inputs["input_ids"][0]), 76)
 
     # We need to overwrite this test to adapt it to our processor.
     @require_torch
@@ -456,10 +460,11 @@ def test_unstructured_kwargs(self):
             text=input_str,
             images=image_input,
             return_tensors="pt",
-            max_image_size={"longest_edge": 214},
+            max_image_size={"longest_edge": 32},
             padding="max_length",
             max_length=120,
+            truncation="longest_first",
         )
 
-        self.assertEqual(inputs["pixel_values"].shape[3], 214)
+        self.assertEqual(inputs["pixel_values"].shape[3], 32)
         self.assertEqual(len(inputs["input_ids"][0]), 120)
diff --git a/utils/check_repo.py b/utils/check_repo.py
index 02570e3c60c3ef..819aafcadbf936 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -82,6 +82,7 @@
     "SeamlessM4Tv2TextToUnitModel",
     "SeamlessM4Tv2CodeHifiGan",
     "SeamlessM4Tv2TextToUnitForConditionalGeneration",
+    "Idefics3VisionTransformer",
 ]
 
 # Update this list for models that are not tested with a comment explaining the reason it should not be.
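Editor's note (not part of the patch): the test changes above exercise the merged-kwargs routing that `Idefics3ProcessorKwargs` enables — flat kwargs passed to the processor are dispatched to `text_kwargs`, `images_kwargs`, or `common_kwargs`. A minimal usage sketch follows; the checkpoint name, image URL, and kwarg values are taken from the tests in this diff, while the printed shapes are illustrative and depend on the processor config.

```python
# Sketch of the unstructured-kwargs call path tested above.
from io import BytesIO

import requests
from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3")
image = Image.open(
    BytesIO(
        requests.get(
            "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
        ).content
    )
)

# Flat kwargs are routed via Idefics3ProcessorKwargs: max_image_size goes to the
# image processor (images_kwargs); padding/max_length/truncation go to the
# tokenizer (text_kwargs); return_tensors is a common kwarg.
inputs = processor(
    text="<image>In this image, we see",
    images=image,
    return_tensors="pt",
    max_image_size={"longest_edge": 364},
    padding="max_length",
    max_length=120,
    truncation="longest_first",
)
print(inputs["input_ids"].shape, inputs["pixel_values"].shape)
```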