From 7d93bb5b4bacf8bc5e66233a50d837ca953f8c71 Mon Sep 17 00:00:00 2001
From: Niels <niels.rogge1@gmail.com>
Date: Wed, 28 Aug 2024 18:28:30 +0200
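Subject: [PATCH] First draft: remove tokenizer_kwargs from zero-shot image classification pipeline and set BLIP-2 tokenizer model_input_names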

---
 .../blip_2/convert_blip_2_original_to_pytorch.py     |  2 ++
 .../pipelines/zero_shot_image_classification.py      | 12 ++----------
 .../test_pipelines_zero_shot_image_classification.py |  2 --
 3 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py b/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py
index 5f972353c4f41e..f571be20e13577 100644
--- a/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py
+++ b/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py
@@ -155,6 +155,8 @@ def convert_blip2_checkpoint(
     else:
         tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl")
 
+    tokenizer.model_input_names = ["input_ids", "attention_mask"]
+
     if "itm" in model_name:
         eos_token_id = None
     else:
diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py
index b7e13e782e78cc..210ef4a7e67827 100644
--- a/src/transformers/pipelines/zero_shot_image_classification.py
+++ b/src/transformers/pipelines/zero_shot_image_classification.py
@@ -97,9 +97,6 @@ def __call__(self, images: Union[str, List[str], "Image", List["Image"]], **kwar
                 The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
                 the call may block forever.
 
-            tokenizer_kwargs (`dict`, *optional*):
-                Additional dictionary of keyword arguments passed along to the tokenizer.
-
         Return:
             A list of dictionaries containing one entry per proposed label. Each dictionary contains the
             following keys:
@@ -109,7 +106,7 @@ def __call__(self, images: Union[str, List[str], "Image", List["Image"]], **kwar
         """
         return super().__call__(images, **kwargs)
 
-    def _sanitize_parameters(self, tokenizer_kwargs=None, **kwargs):
+    def _sanitize_parameters(self, **kwargs):
         preprocess_params = {}
         if "candidate_labels" in kwargs:
             preprocess_params["candidate_labels"] = kwargs["candidate_labels"]
@@ -117,8 +114,6 @@ def _sanitize_parameters(self, tokenizer_kwargs=None, **kwargs):
             preprocess_params["timeout"] = kwargs["timeout"]
         if "hypothesis_template" in kwargs:
             preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"]
-        if tokenizer_kwargs is not None:
-            preprocess_params["tokenizer_kwargs"] = tokenizer_kwargs
 
         return preprocess_params, {}, {}
 
@@ -128,10 +123,7 @@ def preprocess(
         candidate_labels=None,
         hypothesis_template="This is a photo of {}.",
         timeout=None,
-        tokenizer_kwargs=None,
     ):
-        if tokenizer_kwargs is None:
-            tokenizer_kwargs = {}
         image = load_image(image, timeout=timeout)
         inputs = self.image_processor(images=[image], return_tensors=self.framework)
         if self.framework == "pt":
@@ -139,7 +131,7 @@ def preprocess(
         inputs["candidate_labels"] = candidate_labels
         sequences = [hypothesis_template.format(x) for x in candidate_labels]
         padding = "max_length" if self.model.config.model_type == "siglip" else True
-        text_inputs = self.tokenizer(sequences, return_tensors=self.framework, padding=padding, **tokenizer_kwargs)
+        text_inputs = self.tokenizer(sequences, return_tensors=self.framework, padding=padding)
         inputs["text_inputs"] = [text_inputs]
         return inputs
 
diff --git a/tests/pipelines/test_pipelines_zero_shot_image_classification.py b/tests/pipelines/test_pipelines_zero_shot_image_classification.py
index b57adf609d1e09..660d089c4b8180 100644
--- a/tests/pipelines/test_pipelines_zero_shot_image_classification.py
+++ b/tests/pipelines/test_pipelines_zero_shot_image_classification.py
@@ -292,7 +292,6 @@ def test_blip2_model_pt(self):
         output = image_classifier(
             image,
             candidate_labels=["2 cats", "a plane", "a remote"],
-            tokenizer_kwargs={"return_token_type_ids": False},
         )
 
         self.assertEqual(
@@ -308,7 +307,6 @@ def test_blip2_model_pt(self):
             [image] * 5,
             candidate_labels=["2 cats", "a plane", "a remote"],
             batch_size=2,
-            tokenizer_kwargs={"return_token_type_ids": False},
         )
 
         self.assertEqual(