From 767c81d0533a2bf1bd44b284382d8c03d132ad62 Mon Sep 17 00:00:00 2001
From: Andres Marafioti
Date: Mon, 12 Aug 2024 11:45:24 +0000
Subject: [PATCH] improve example

---
 .../models/idefics3/modeling_idefics3.py | 40 ++++++++++++++------
 1 file changed, 28 insertions(+), 12 deletions(-)

diff --git a/src/transformers/models/idefics3/modeling_idefics3.py b/src/transformers/models/idefics3/modeling_idefics3.py
index f7474a42e014d8..9ebf962b2a746a 100644
--- a/src/transformers/models/idefics3/modeling_idefics3.py
+++ b/src/transformers/models/idefics3/modeling_idefics3.py
@@ -1119,25 +1119,41 @@ def forward(
         >>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")
 
         >>> processor = AutoProcessor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3")
-        >>> model = AutoModelForVision2Seq.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3", device_map="auto")
-
-        >>> BAD_WORDS_IDS = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
-        >>> EOS_WORDS_IDS = [processor.tokenizer.eos_token_id]
+        >>> model = AutoModelForVision2Seq.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3", torch_dtype=torch.bfloat16, device_map="auto")
 
         >>> # Create inputs
-        >>> prompts = [
-        ...     "<image>In this image, we can see the city of New York, and more specifically the Statue of Liberty.<image>In this image,",
-        ...     "In which city is that bridge located?<image>",
-        ... ]
+        >>> messages = [
+        ...     {
+        ...         "role": "user",
+        ...         "content": [
+        ...             {"type": "image"},
+        ...             {"type": "text", "text": "In this image, we can see the city of New York, and more specifically the Statue of Liberty."},
+        ...             {"type": "image"},
+        ...             {"type": "text", "text": "What can we see in this image?"},
+        ...         ],
+        ...     },
+        ...     {
+        ...         "role": "user",
+        ...         "content": [
+        ...             {"type": "image"},
+        ...             {"type": "text", "text": "In which city is that bridge located?"},
+        ...         ],
+        ...     },
+        ... ]
+
+        >>> prompts = [processor.apply_chat_template([message], add_generation_prompt=True) for message in messages]
         >>> images = [[image1, image2], [image3]]
-        >>> inputs = processor(text=prompts, images=images, padding=True, return_tensors="pt").to("cuda")
+        >>> inputs = processor(text=prompts, images=images, padding=True, return_tensors="pt").to(model.device)
 
         >>> # Generate
-        >>> generated_ids = model.generate(**inputs, bad_words_ids=BAD_WORDS_IDS, max_new_tokens=20)
+        >>> generated_ids = model.generate(**inputs, max_new_tokens=256)
         >>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
-        >>> print(generated_texts)
-        ['In this image, we can see the city of New York, and more specifically the Statue of Liberty. In this image, we can see the city of New York, and more specifically the Statue of Liberty.\n\n', 'In which city is that bridge located?\n\nThe bridge is located in the city of Pittsburgh, Pennsylvania.\n\n\nThe bridge is']
+        >>> print(generated_texts[0])
+        Assistant: There are buildings, trees, lights, and water visible in this image.
+
+        >>> print(generated_texts[1])
+        Assistant: The bridge is in San Francisco.
         ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
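
For reference, a minimal standalone sketch of the patched example, outside the doctest markers. It assumes the HuggingFaceM4/Idefics3-8B-Llama3 checkpoint is reachable and that there is enough GPU memory for a bfloat16 load; the image1/image2 URLs are not part of this hunk and are assumed to match the load_image calls just above it in the docstring.

```python
# Standalone sketch of the patched docstring example (not part of the diff).
import torch
from transformers import AutoModelForVision2Seq, AutoProcessor
from transformers.image_utils import load_image

image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")  # assumed URL
image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg")  # assumed URL
image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")

processor = AutoProcessor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3")
model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceM4/Idefics3-8B-Llama3", torch_dtype=torch.bfloat16, device_map="auto"
)

# One conversation per batch element; each is templated separately so that
# every prompt carries only the image placeholders for its own images.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "In this image, we can see the city of New York, and more specifically the Statue of Liberty."},
            {"type": "image"},
            {"type": "text", "text": "What can we see in this image?"},
        ],
    },
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "In which city is that bridge located?"},
        ],
    },
]

prompts = [processor.apply_chat_template([m], add_generation_prompt=True) for m in messages]
images = [[image1, image2], [image3]]  # images per batch element, in prompt order
inputs = processor(text=prompts, images=images, padding=True, return_tensors="pt").to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
print(processor.batch_decode(generated_ids, skip_special_tokens=True))
```

Two design points worth noting in the patch itself: `.to(model.device)` replaces the hardcoded `"cuda"`, which is safer when device_map="auto" shards the model, and bad_words_ids is dropped because the chat template now inserts the image tokens itself, so there is no longer a need to suppress them at generation time.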