Commit 767c81d

improve example

andimarafioti committed Aug 12, 2024
1 parent b5b033f, commit 767c81d

Showing 1 changed file with 30 additions and 12 deletions.

src/transformers/models/idefics3/modeling_idefics3.py (30 additions, 12 deletions)
@@ -1119,25 +1119,43 @@ def forward(
  >>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")
  >>> processor = AutoProcessor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3")
- >>> model = AutoModelForVision2Seq.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3", device_map="auto")
- >>> BAD_WORDS_IDS = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
- >>> EOS_WORDS_IDS = [processor.tokenizer.eos_token_id]
+ >>> model = AutoModelForVision2Seq.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3", torch_dtype=torch.bfloat16, device_map="auto")
  >>> # Create inputs
- >>> prompts = [
- ...     "<image>In this image, we can see the city of New York, and more specifically the Statue of Liberty.<image>In this image,",
- ...     "In which city is that bridge located?<image>",
- ... ]
+ >>> messages = [
+ ...     {
+ ...         "role": "user",
+ ...         "content": [
+ ...             {"type": "image"},
+ ...             {"type": "text", "text": "In this image, we can see the city of New York, and more specifically the Statue of Liberty."},
+ ...             {"type": "image"},
+ ...             {"type": "text", "text": "What can we see in this image?"},
+ ...         ]
+ ...     },
+ ...     {
+ ...         "role": "user",
+ ...         "content": [
+ ...             {"type": "image"},
+ ...             {"type": "text", "text": "In which city is that bridge located?"},
+ ...         ]
+ ...     }
+ ... ]
+ >>> prompts = [processor.apply_chat_template([message], add_generation_prompt=True) for message in messages]
  >>> images = [[image1, image2], [image3]]
- >>> inputs = processor(text=prompts, images=images, padding=True, return_tensors="pt").to("cuda")
+ >>> inputs = processor(text=prompts, images=images, padding=True, return_tensors="pt").to(model.device)
  >>> # Generate
- >>> generated_ids = model.generate(**inputs, bad_words_ids=BAD_WORDS_IDS, max_new_tokens=20)
+ >>> generated_ids = model.generate(**inputs, max_new_tokens=256)
  >>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
- >>> print(generated_texts)
- ['In this image, we can see the city of New York, and more specifically the Statue of Liberty. In this image, we can see the city of New York, and more specifically the Statue of Liberty.\n\n', 'In which city is that bridge located?\n\nThe bridge is located in the city of Pittsburgh, Pennsylvania.\n\n\nThe bridge is']
+ >>> print(generated_texts[0])
+ Assistant: There are buildings, trees, lights, and water visible in this image.
+ >>> print(generated_texts[1])
+ Assistant: The bridge is in San Francisco.
  ```"""
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
  output_hidden_states = (
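
For reference, the updated docstring example corresponds to a standalone script along the following lines. This is a sketch rather than the verbatim docstring: it assumes a transformers build that includes Idefics3, the `import torch` needed for `torch.bfloat16` sits above this hunk, and the definitions of `image1` and `image2` also sit above the hunk, so the first two URLs below are placeholders.

```python
# Standalone sketch of the updated example. Assumes a transformers version
# that ships Idefics3 and enough GPU memory for the 8B checkpoint in bfloat16.
import torch
from transformers import AutoModelForVision2Seq, AutoProcessor
from transformers.image_utils import load_image

# image1/image2 are defined above this hunk; these two URLs are placeholders.
image1 = load_image("https://example.com/new-york.jpg")
image2 = load_image("https://example.com/statue-of-liberty.jpg")
image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")

processor = AutoProcessor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3")
model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceM4/Idefics3-8B-Llama3", torch_dtype=torch.bfloat16, device_map="auto"
)

# One conversation per batch element; each {"type": "image"} placeholder is
# matched positionally with an entry of the nested `images` list below.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "In this image, we can see the city of New York, and more specifically the Statue of Liberty."},
            {"type": "image"},
            {"type": "text", "text": "What can we see in this image?"},
        ],
    },
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "In which city is that bridge located?"},
        ],
    },
]

# apply_chat_template renders each conversation into a prompt string with the
# model's image tokens inserted where the placeholders appear.
prompts = [processor.apply_chat_template([m], add_generation_prompt=True) for m in messages]
images = [[image1, image2], [image3]]  # two images for the first prompt, one for the second

inputs = processor(text=prompts, images=images, padding=True, return_tensors="pt").to(model.device)
generated_ids = model.generate(**inputs, max_new_tokens=256)
print(processor.batch_decode(generated_ids, skip_special_tokens=True))
```

Note that the rewritten example also drops the `bad_words_ids` workaround: with prompts built via `apply_chat_template`, the model is queried in its expected chat format, and the example no longer needs to ban image tokens at generation time.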
