diff --git a/src/transformers/models/clipseg/processing_clipseg.py b/src/transformers/models/clipseg/processing_clipseg.py
index 121c1e2aa95442..cd9f1095917ec3 100644
--- a/src/transformers/models/clipseg/processing_clipseg.py
+++ b/src/transformers/models/clipseg/processing_clipseg.py
@@ -169,7 +169,7 @@ def __call__(
             return encoding
         else:
             return BatchEncoding(
-                data=dict(**image_features), tensor_type=output_kwargs["common_kwargs"]["return_tensors"]
+                data=dict(**image_features), tensor_type=output_kwargs["common_kwargs"].get("return_tensors")
             )
 
     def batch_decode(self, *args, **kwargs):
diff --git a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py
index 1ec03dd661420b..b233db4a83710d 100644
--- a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py
+++ b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py
@@ -150,7 +150,7 @@ def __call__(
         )
 
         # cast to desired return tensors type after concatenating
-        text_encoding = BatchEncoding(text_encoding, tensor_type=output_kwargs["common_kwargs"]["return_tensors"])
+        text_encoding = BatchEncoding(text_encoding, tensor_type=output_kwargs["common_kwargs"].get("return_tensors"))
         encoding.update(text_encoding)
         qformer_text_encoding = self.qformer_tokenizer(text=text, **output_kwargs["text_kwargs"])
         encoding["qformer_input_ids"] = qformer_text_encoding.pop("input_ids")
diff --git a/src/transformers/models/owlv2/processing_owlv2.py b/src/transformers/models/owlv2/processing_owlv2.py
index 0f605ac3f6adc3..6d7d68048abc91 100644
--- a/src/transformers/models/owlv2/processing_owlv2.py
+++ b/src/transformers/models/owlv2/processing_owlv2.py
@@ -42,7 +42,7 @@ class Owlv2ProcessorKwargs(ProcessingKwargs, total=False):
     images_kwargs: Owlv2ImagesKwargs
     _defaults = {
         "text_kwargs": {
-            "padding": False,
+            "padding": "max_length",
         },
         "common_kwargs": {
             "return_tensors": "np",
@@ -70,7 +70,6 @@ class Owlv2Processor(ProcessorMixin):
     def __init__(self, image_processor, tokenizer, **kwargs):
         super().__init__(image_processor, tokenizer)
 
-    # Copied from transformers.models.owlvit.processing_owlvit.OwlViTProcessor.__call__ with OWLViT->OWLv2
     def __call__(
         self,
         text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
diff --git a/src/transformers/models/owlvit/processing_owlvit.py b/src/transformers/models/owlvit/processing_owlvit.py
index 7a512f2d1d7e02..c81a1ff743e6de 100644
--- a/src/transformers/models/owlvit/processing_owlvit.py
+++ b/src/transformers/models/owlvit/processing_owlvit.py
@@ -42,7 +42,7 @@ class OwlViTProcessorKwargs(ProcessingKwargs, total=False):
    images_kwargs: OwlViTImagesKwargs
     _defaults = {
         "text_kwargs": {
-            "padding": False,
+            "padding": "max_length",
         },
         "common_kwargs": {
             "return_tensors": "np",
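Note for reviewers: the `.get("return_tensors")` change matters because `common_kwargs` only contains the keys the caller actually merged in, so direct indexing raises `KeyError` whenever `return_tensors` was never passed, while `.get` yields `None` and `BatchEncoding(tensor_type=None)` simply skips tensor conversion. A minimal standalone sketch of that behavior (illustrative only, not part of the patch):

```python
# Illustrative sketch, not part of the patch: why `.get` replaces direct
# indexing on common_kwargs when return_tensors may be absent.
from transformers import BatchEncoding

common_kwargs = {}  # the caller never passed `return_tensors`

# common_kwargs["return_tensors"] would raise KeyError here; `.get` yields
# None, and BatchEncoding(tensor_type=None) skips tensor conversion entirely.
encoding = BatchEncoding(
    data={"input_ids": [[1, 2, 3]]},
    tensor_type=common_kwargs.get("return_tensors"),
)
print(type(encoding["input_ids"]))  # <class 'list'>: data left unconverted
```

The `"padding": "max_length"` defaults restore the behavior the OWL-ViT and OWLv2 processors previously hard-coded in `__call__`, padding text queries to the tokenizer's fixed maximum length rather than to the longest query in the batch.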