 Processor class for IDEFICS.
 """

-from typing import Callable, List, Optional, Union
+from typing import Callable, Dict, List, Optional, Union
 from urllib.parse import urlparse

 from ...feature_extraction_utils import BatchFeature
-from ...processing_utils import ProcessorMixin
-from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, TextInput, TruncationStrategy
+from ...processing_utils import (
+    ImagesKwargs,
+    ProcessingKwargs,
+    ProcessorMixin,
+    TextKwargs,
+    Unpack,
+    _validate_images_text_input_order,
+)
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
 from ...utils import is_tf_available, is_torch_available
+from ...utils.deprecation import deprecate_kwarg


 if is_torch_available():

 IMAGE_TOKEN = "<image>"

+class IdeficsImagesKwargs(ImagesKwargs, total=False):
+    transform: Optional[Callable]
+    image_size: Optional[Dict[str, int]]
+    image_mean: Optional[Union[float, List[float]]]
+    image_std: Optional[Union[float, List[float]]]
+
+
+class IdeficsTextKwargs(TextKwargs, total=False):
+    add_eos_token: Optional[bool]
+    add_end_of_utterance_token: Optional[bool]
+
+
+class IdeficsProcessorKwargs(ProcessingKwargs, total=False):
+    text_kwargs: IdeficsTextKwargs
+    images_kwargs: IdeficsImagesKwargs
+    _defaults = {
+        "text_kwargs": {
+            "add_special_tokens": False,
+            "padding": "longest",
+            "add_eos_token": False,
+        },
+        "images_kwargs": {},
+        "common_kwargs": {"return_tensors": "pt"},
+    }
+
+
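For readers unfamiliar with the `ProcessingKwargs` pattern: the `_defaults` above seed per-modality kwarg groups, and anything the caller passes to `__call__` overrides the matching default. Below is a minimal standalone sketch of that precedence, illustrative only; the real `ProcessorMixin._merge_kwargs` also consults `tokenizer.init_kwargs` and validates keys against the TypedDicts above, and the routing table here is hard-coded for brevity.

```python
from typing import Any, Dict

# Mirrors IdeficsProcessorKwargs._defaults from the diff above.
DEFAULTS: Dict[str, Dict[str, Any]] = {
    "text_kwargs": {"add_special_tokens": False, "padding": "longest", "add_eos_token": False},
    "images_kwargs": {},
    "common_kwargs": {"return_tensors": "pt"},
}

# Hypothetical routing table: which flat kwarg belongs to which group.
IMAGE_KEYS = {"transform", "image_size", "image_mean", "image_std"}

def merge_kwargs(**user_kwargs: Any) -> Dict[str, Dict[str, Any]]:
    # Start from a copy of the defaults, then let caller values win.
    merged = {group: dict(values) for group, values in DEFAULTS.items()}
    for key, value in user_kwargs.items():
        group = "images_kwargs" if key in IMAGE_KEYS else "text_kwargs"
        merged[group][key] = value
    return merged

# padding overrides the "longest" default; image_size is routed to images_kwargs.
print(merge_kwargs(padding="max_length", image_size={"height": 224, "width": 224}))
```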
 # copied from m4.training.packing
 def incremental_to_binary_attention_mask(incremental_mask, return_tensors, num_classes=-1):
     # Set elements >= num_classes to -1
@@ -199,52 +233,32 @@ def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_u
             else False
         )

+    @deprecate_kwarg(old_name="prompts", version="5.0.0", new_name="text", raise_if_both_names=True)
     def __call__(
         self,
-        prompts: Union[List[TextInput], List[List[TextInput]]],
-        padding: Union[bool, str, PaddingStrategy] = "longest",
-        truncation: Union[bool, str, TruncationStrategy] = None,
-        max_length: Optional[int] = None,
-        transform: Callable = None,
-        add_eos_token=False,
-        add_end_of_utterance_token=None,
-        debug=False,
-        return_tensors="pt",
-    ) -> BatchEncoding:
+        images=None,
+        text: Union[
+            TextInput,
+            PreTokenizedInput,
+            List[TextInput],
+            List[PreTokenizedInput],
+            List[List[TextInput]],
+            List[List[PreTokenizedInput]],
+        ] = None,
+        audio=None,
+        videos=None,
+        **kwargs: Unpack[IdeficsProcessorKwargs],
+    ) -> BatchFeature:
         """This method takes batched or non-batched prompts made of text and images and converts them into prompts that
         the model was trained on and prepares the image pixel values for the model to process.

         Args:
-            prompts (`Union[List[TextInput], [List[List[TextInput]]]]`):
+            images (`Union[PIL.Image, str, List[PIL.Image], List[str]]`):
+                either a single image or a batched list of images - can be passed in when text contains only text prompts,
+                in order to use the image-text-to-text behavior.
+            text (`Union[List[TextInput], [List[List[TextInput]]]]`):
                 either a single prompt or a batched list of prompts - see the detailed description immediately after
                 the end of the arguments doc section.
-            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `"longest"`):
-                Select a strategy to pad the returned sequences (according to the model's padding side and padding
-                index) among:
-                - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
-                  sequence if provided).
-                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
-                  acceptable input length for the model if that argument is not provided.
-                - `False` or `'do_not_pad'`: No padding. This will raise an error if the input sequences are of different
-                  lengths.
-                Note: Unlike most processors, which set padding=`False` by default, `IdeficsProcessor` sets `padding="longest"`
-                by default. See https://github.com/huggingface/transformers/pull/29449#pullrequestreview-1925576061 for why.
-            max_length (`int`, *optional*):
-                Maximum length of the returned list and optionally padding length (see above).
-            truncation (`bool`, *optional*):
-                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
-            transform (`Callable`, *optional*):
-                A custom transform function that accepts a single image can be passed for training. For example,
-                `torchvision.Compose` can be used to compose multiple functions. If `None` a preset inference-specific
-                set of transforms will be applied to the images
-            add_eos_token (`bool`, *optional*, defaults to `False`):
-                Adds `eos_token` at the end of the final prompt if True`
-            add_end_of_utterance_token (`bool`, *optional*)
-                Whether to automatically add `<end_of_utterance>` after each prompt's text input (unless followed by an
-                image). If `None` the tokenizer will be checked instead and if this token is found in
-                `additional_special_tokens` then the value will be `True`.
-            debug (`bool`, *optional*, defaults to `False`):
-                `True` value will help debug prompt generation by dumping useful information
             return_tensors (`str` or `TensorType`, *optional*, defaults to `TensorType.PYTORCH`):
                 The type of tensors to return. Can be one of:
                 - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
@@ -255,7 +269,7 @@ def __call__(
         Detailed explanation:

-        Each entry in `prompts` is either a text to be passed as is or an image that will be processed.
+        Each entry in `text` is either a text to be passed as is or an image that will be processed.

         An image can be either an image object (`PIL.Image`) or a url from which the image can be retrieved.
@@ -279,7 +293,7 @@ def __call__(
            "Describe this image.\nAssistant:",
        ]

-        inputs = processor(prompts, return_tensors="pt")
+        inputs = processor(text=prompts, return_tensors="pt")
         generated_ids = model.generate(**inputs, max_length=100)
         generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
         ```
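As a complement to the interleaved-prompt example in the docstring above, the new dual `images`/`text` path added in this PR can be exercised as below. This is a hypothetical snippet: the checkpoint name and the dummy `PIL` images are assumptions, not taken from the diff.

```python
from PIL import Image
from transformers import IdeficsProcessor

processor = IdeficsProcessor.from_pretrained("HuggingFaceM4/idefics-9b")
img1 = Image.new("RGB", (224, 224), "white")  # stand-ins for real images
img2 = Image.new("RGB", (224, 224), "black")

# One text prompt per image: the lengths must match, per the new validation.
inputs = processor(
    images=[img1, img2],
    text=["Describe this image.", "Describe this image."],
    return_tensors="pt",
)

# Several images for one prompt must be nested: one inner list per prompt.
inputs = processor(
    images=[[img1, img2]],
    text=["Compare these two images."],
    return_tensors="pt",
)
```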
@@ -311,18 +325,55 @@ def __call__(
                 transforms.Normalize(mean=self.image_mean, std=self.image_std),
             ]
         )
-        inputs = processor(prompts, transform=image_transform, return_tensors="pt")
+        inputs = processor(text=prompts, transform=image_transform, return_tensors="pt")
         ```

         In order to help debug prompt generation enable `debug=True` which will show you what's happening.

         """
+        if images is None and text is None:
+            raise ValueError("You need to specify either `text` or `images` and `text`.")
+        # check if images and text inputs are reversed for BC
+        images, text = _validate_images_text_input_order(images, text)
+
+        if images is None:
+            # assuming the user wants to use the old behavior with prompts as the only argument
+            prompts = text
+        elif text is not None:
+            # Assuming image-text-to-text behavior:
+            # Check if batched images are provided
+            if not isinstance(images, (list, tuple)):
+                images = [images]
+            if isinstance(text, str):
+                text = [text]
+            # Check if batched images and text are in the correct format
+            if isinstance(text, (list, tuple)) and len(text) != len(images):
+                raise ValueError(
+                    "When providing both images and text arguments, the number of text prompts should be the same as the number of images."
+                    " If you want to have several images per prompt, images should be nested as such: images=[[img1, img2], [img3, img4], ...] for text=[prompt1, prompt2, ...]."
+                )
+            # Check that only text is present in the prompts
+            if not all(isinstance(i, str) for i in text):
+                raise ValueError("When using the image-text-to-text behavior, the prompts should only contain text.")
+            if isinstance(images[0], (list, tuple)):
+                # if nested images, nest text as well
+                text = [[i] for i in text]
+            prompts = list(zip(images, text))
+
+        output_kwargs = self._merge_kwargs(
+            IdeficsProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+
+        add_eos_token = output_kwargs["text_kwargs"].pop("add_eos_token", False)
+        add_end_of_utterance_token = output_kwargs["text_kwargs"].pop("add_end_of_utterance_token", None)

         # if the value isn't overriden by the user, check if the tokenizer was trained with this token and then use it
         if add_end_of_utterance_token is None:
             add_end_of_utterance_token = self.tokenizer_was_trained_with_end_of_utterance_token
         # turn non-batched prompts into batched
-        if not any(isinstance(i, list) for i in prompts):
+        if not any(isinstance(i, (list, tuple)) for i in prompts):
             prompts = [prompts]

         fake_token = "<fake_token_around_image>"
@@ -371,21 +422,14 @@ def image_tokens(last_was_image):
             if add_eos_token:
                 full_text += self.tokenizer.eos_token

-            if debug is True:
-                print(f"{full_text=}")
-
-            image_objects = self.image_processor(image_objects, transform=transform, return_tensors=return_tensors)
+            image_objects = self.image_processor(image_objects, **output_kwargs["images_kwargs"])

             all_prompts.append(full_text)
             all_images.append(image_objects)

-        text_encoding = self.tokenizer(
-            text=all_prompts,
-            add_special_tokens=False,
-            padding=padding,
-            truncation=truncation,
-            max_length=max_length,
-        )
+        # For BC
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", "pt")
+        text_encoding = self.tokenizer(all_prompts, **output_kwargs["text_kwargs"])

         all_texts = text_encoding["input_ids"]
         all_attention_masks = text_encoding["attention_mask"]
@@ -398,12 +442,12 @@ def image_tokens(last_was_image):
         output_images = []
         output_attention_masks = []

-        for text, attention_mask, images in zip(all_texts, all_attention_masks, all_images):
-            padded_input_ids = text
+        for text_single, attention_mask, extracted_images in zip(all_texts, all_attention_masks, all_images):
+            padded_input_ids = text_single
             image_count = padded_input_ids.count(self.image_token_id)
             local_max_num_images = min(image_count, max_num_images)

-            current_images = images[:local_max_num_images]
+            current_images = extracted_images[:local_max_num_images]

             if len(current_images) > 0:
                 if return_tensors == "pt":
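Since `padding`, `truncation`, `max_length`, `transform`, and `add_eos_token` are no longer explicit parameters of `__call__`, they now travel through `**kwargs` and land in the groups declared by `IdeficsProcessorKwargs`. A sketch of how a caller reproduces the old signature's options under the new API; argument values are illustrative, and `processor`/`prompts` are assumed from the earlier snippets:

```python
inputs = processor(
    text=prompts,
    padding="max_length",  # text_kwargs: overrides the "longest" default
    truncation=True,       # text_kwargs
    max_length=128,        # text_kwargs
    add_eos_token=True,    # Idefics-specific text kwarg, popped before tokenization
    return_tensors="pt",   # common_kwargs; "pt" is already the default
)
```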