diff --git a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py index b233db4a83710d..c38edc2048549f 100644 --- a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py @@ -150,7 +150,9 @@ def __call__( ) # cast to desired return tensors type after concatenating - text_encoding = BatchEncoding(text_encoding, tensor_type=output_kwargs["common_kwargs"].get("return_tensors")) + text_encoding = BatchEncoding( + text_encoding, tensor_type=output_kwargs["common_kwargs"].get("return_tensors") + ) encoding.update(text_encoding) qformer_text_encoding = self.qformer_tokenizer(text=text, **output_kwargs["text_kwargs"]) encoding["qformer_input_ids"] = qformer_text_encoding.pop("input_ids")