Skip to content

Commit

Permalink
fixes tests
Browse files — Browse the repository at this point in the history
  • Loading branch information
andimarafioti committed Aug 15, 2024
1 parent 1f11f95 commit f67ed1e
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 10 deletions.
5 changes: 2 additions & 3 deletions src/transformers/models/idefics3/processing_idefics3.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@
Processor class for Idefics3.
"""

import re
import sys
from typing import TYPE_CHECKING, List, Optional, Union
import re

from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput, is_valid_image, load_image
Expand Down Expand Up @@ -147,7 +147,7 @@ def __init__(self, image_processor, tokenizer=None, image_seq_len: int = 169, ch
self.global_img_token = "<global-img>"
self.image_seq_len = image_seq_len

self._regex_to_remove_extra_special_tokens = re.compile(r'(\n?<global-img>\n?|<row_\d+_col_\d+>\n?)+')
self._regex_to_remove_extra_special_tokens = re.compile(r"(\n?<global-img>\n?|<row_\d+_col_\d+>\n?)+")

tokens_to_add = {
"additional_special_tokens": [
Expand Down Expand Up @@ -356,7 +356,6 @@ def decode(self, *args, **kwargs):
decode_output = self.tokenizer.decode(*args, **kwargs)
return self._regex_to_remove_extra_special_tokens.sub("<image>", decode_output)


@property
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
Expand Down
4 changes: 2 additions & 2 deletions tests/models/idefics3/test_modeling_idefics3.py
Original file line number Diff line number Diff line change
Expand Up @@ -501,7 +501,7 @@ def test_integration_test(self):
generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True)

# Batch affects generated text. Single batch output: ['In this image, we see the Statue of Liberty in the foreground and']
expected_generated_text = "In this image, we see the Statue of Liberty, the New York City"
expected_generated_text = "<image>In this image, we see the Statue of Liberty, which is located on Liberty"
self.assertEqual(generated_texts[0], expected_generated_text)

@slow
Expand All @@ -520,5 +520,5 @@ def test_integration_test_4bit(self):
generated_ids = model.generate(**inputs, max_new_tokens=10)
generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True)

expected_generated_text = "In this image, we see the Statue of Liberty, the Hudson River,"
expected_generated_text = "<image>In this image, we see the Statue of Liberty, which is located on Liberty"
self.assertEqual(generated_texts[0], expected_generated_text)
10 changes: 5 additions & 5 deletions tests/models/idefics3/test_processing_idefics3.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def setUp(self):
self.bos_token_id = processor.tokenizer.convert_tokens_to_ids(self.bos_token)
self.image_token_id = processor.tokenizer.convert_tokens_to_ids(self.image_token)
self.fake_image_token_id = processor.tokenizer.convert_tokens_to_ids(self.fake_image_token)
self.global_img_token_id = processor.global_img_token_id
self.global_img_tokens_id = processor.tokenizer(self.global_img_token, add_special_tokens=False)["input_ids"]
self.padding_token_id = processor.tokenizer.pad_token_id
self.image_seq_len = processor.image_seq_len

Expand Down Expand Up @@ -96,7 +96,7 @@ def get_splitted_image_expected_tokens(self, processor, image_rows, image_cols):
] # add double newline, as it gets its own token
text_split_images += (
[self.fake_image_token_id]
+ [self.global_img_token_id]
+ self.global_img_tokens_id
+ [self.image_token_id] * self.image_seq_len
+ [self.fake_image_token_id]
)
Expand Down Expand Up @@ -124,7 +124,7 @@ def test_process_interleaved_images_prompts_no_image_splitting(self):

# fmt: off
tokenized_sentence = processor.tokenizer(text_str, add_special_tokens=False)
expected_input_ids = [[self.bos_token_id] + [self.fake_image_token_id] + [self.global_img_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + tokenized_sentence["input_ids"]]
expected_input_ids = [[self.bos_token_id] + [self.fake_image_token_id] + self.global_img_tokens_id + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + tokenized_sentence["input_ids"]]
self.assertEqual(inputs["input_ids"], expected_input_ids)
self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])])
self.assertEqual(inputs["pixel_values"].shape, (1, 1, 3, 1092, 1456))
Expand All @@ -147,7 +147,7 @@ def test_process_interleaved_images_prompts_no_image_splitting(self):
# fmt: off
tokenized_sentence_1 = processor.tokenizer(text_str_1, add_special_tokens=False)
tokenized_sentence_2 = processor.tokenizer(text_str_2, add_special_tokens=False)
image_tokens = [self.fake_image_token_id] + [self.global_img_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id]
image_tokens = [self.fake_image_token_id] + self.global_img_tokens_id + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id]
expected_input_ids_1 = [self.bos_token_id] + image_tokens + tokenized_sentence_1["input_ids"]
expected_input_ids_2 = [self.bos_token_id] + 2 * image_tokens + tokenized_sentence_2["input_ids"]
# Pad the first input to match the second input
Expand Down Expand Up @@ -436,7 +436,7 @@ def test_unstructured_kwargs_batched(self):

self.assertEqual(inputs["pixel_values"].shape[2], 3)
self.assertEqual(inputs["pixel_values"].shape[3], 214)
self.assertEqual(len(inputs["input_ids"][0]), 88)
self.assertEqual(len(inputs["input_ids"][0]), 91)

# We need to overwrite this test to adapt it to our processor.
@require_torch
Expand Down

0 comments on commit f67ed1e

Please sign in to comment.