phi-3 vision notebook
eaidova committed Jul 16, 2024
1 parent 173c19a commit fa40b9e
Showing 4 changed files with 1,062 additions and 0 deletions.
21 changes: 21 additions & 0 deletions notebooks/phi-3-vision/README.md
@@ -0,0 +1,21 @@
## Visual-language assistant with Phi3-Vision and OpenVINO

[Phi-3-Vision-128K-Instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) is a lightweight, state-of-the-art open multimodal model built upon datasets that include synthetic data and filtered publicly available websites, with a focus on very high-quality, reasoning-dense data in both text and vision. The model belongs to the Phi-3 model family, and this multimodal version supports a context length of 128K tokens. The model underwent a rigorous enhancement process, incorporating both supervised fine-tuning and direct preference optimization, to ensure precise instruction adherence and robust safety measures. More details about the model can be found in the [model blog post](https://azure.microsoft.com/en-us/blog/new-models-added-to-the-phi-3-family-available-on-microsoft-azure/), the [technical report](https://aka.ms/phi3-tech-report), and the [Phi-3-cookbook](https://github.com/microsoft/Phi-3CookBook).

In this tutorial we consider how to launch Phi-3-vision using OpenVINO to create a multimodal chatbot. Additionally, we optimize the model to low precision using [NNCF](https://github.com/openvinotoolkit/nncf).
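As a preview of the optimization step, the sketch below shows how the language-model part of a converted OpenVINO IR might be compressed to 4-bit weights with NNCF. The file paths and compression parameters are placeholders for illustration, not the notebook's actual values:

```python
# A minimal weight-compression sketch, assuming the model has already been
# converted to OpenVINO IR; the paths below are placeholders.
import openvino as ov
import nncf

core = ov.Core()
ov_model = core.read_model("phi-3-vision/language_model.xml")

# INT4 weight compression shrinks the model footprint at a small accuracy cost;
# group_size and ratio are typical values, not tuned for this model.
compressed_model = nncf.compress_weights(
    ov_model,
    mode=nncf.CompressWeightsMode.INT4_ASYM,
    group_size=128,
    ratio=0.8,
)
ov.save_model(compressed_model, "phi-3-vision/language_model_int4.xml")
```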

## Notebook contents
The tutorial consists of the following steps:

- Install requirements
- Convert and Optimize model
- Run OpenVINO model inference (see the sketch after this list)
- Launch Interactive demo
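
For the inference step, here is a minimal single-turn sketch. It assumes `model` and `processor` are the OpenVINO-backed model and its processor produced by the conversion step, with the model exposing a transformers-style `generate` API, as the Gradio helper in this example relies on:

```python
# A minimal single-turn inference sketch; `model` and `processor` are assumed
# to come from the notebook's conversion step.
from PIL import Image

image = Image.open("bee.jpg")  # any local image
conversation = [{"role": "user", "content": "<|image_1|>\nWhat is on the flower?"}]
prompt = processor.tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)

inputs = processor(prompt, image, return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=128, eos_token_id=processor.tokenizer.eos_token_id)

# Decode only the newly generated tokens, skipping the prompt.
answer = processor.tokenizer.decode(output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
print(answer)
```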

In this demonstration, you'll create an interactive chatbot that can answer questions about the content of a provided image.


## Installation instructions
This is a self-contained example that relies solely on its own code.<br/>
We recommend running the notebook in a virtual environment. You only need a Jupyter server to start.
For details, please refer to [Installation Guide](../../README.md).
113 changes: 113 additions & 0 deletions notebooks/phi-3-vision/gradio_helper.py
@@ -0,0 +1,113 @@
from pathlib import Path
from threading import Thread

import gradio as gr
import requests
from PIL import Image
from transformers import TextIteratorStreamer


def make_demo(model, processor):
    example_image_urls = [
        ("https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/1d6a0188-5613-418d-a1fd-4560aae1d907", "bee.jpg"),
        ("https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/6cc7feeb-0721-4b5d-8791-2576ed9d2863", "baklava.png"),
        ("https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/dd5105d6-6a64-4935-8a34-3058a82c8d5d", "small.png"),
        ("https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/1221e2a8-a6da-413a-9af6-f04d56af3754", "chart.png"),
    ]

    # Download the example images once; subsequent runs reuse the local files.
    for url, file_name in example_image_urls:
        if not Path(file_name).exists():
            Image.open(requests.get(url, stream=True).raw).save(file_name)

    def bot_streaming(message, history):
        print(f"message is - {message}")
        print(f"history is - {history}")
        image = None
        if message["files"]:
            # message["files"][-1] is either a dict with a "path" key or a plain path string
            if isinstance(message["files"][-1], dict):
                image = message["files"][-1]["path"]
            else:
                image = message["files"][-1]
        else:
            # If no image was uploaded this turn, reuse the most recent image
            # from past turns (kept inside tuples in the history).
            for hist in history:
                if isinstance(hist[0], tuple):
                    image = hist[0][0]
        if image is None:
            raise gr.Error("You need to upload an image for Phi3-Vision to work. Close the error and try again with an image.")

        conversation = []
        image_turn_pending = False
        for user, assistant in history:
            if assistant is None:
                # This history entry only carried the uploaded image; remember it
                # so the image placeholder can be merged into the next text turn.
                image_turn_pending = True
                conversation.append({"role": "user", "content": ""})
                continue
            if image_turn_pending:
                # Attach the image placeholder to the first text turn after the image.
                conversation[0]["content"] = f"<|image_1|>\n{user}"
                conversation.append({"role": "assistant", "content": assistant})
                image_turn_pending = False
                continue
            conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])

        if len(history) == 0:
            conversation.append({"role": "user", "content": f"<|image_1|>\n{message['text']}"})
        else:
            conversation.append({"role": "user", "content": message["text"]})
        print(f"conversation is -\n{conversation}")
prompt = processor.tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
image = Image.open(image)
inputs = processor(prompt, image, return_tensors="pt")

        streamer = TextIteratorStreamer(
            processor,
            skip_special_tokens=True,
            skip_prompt=True,
            clean_up_tokenization_spaces=False,
        )
        generation_kwargs = dict(
            inputs,
            streamer=streamer,
            max_new_tokens=1024,
            do_sample=False,  # greedy decoding, so no temperature is needed
            eos_token_id=processor.tokenizer.eos_token_id,
        )

        # Run generation in a background thread so tokens can be streamed to the UI.
        thread = Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()

buffer = ""
for new_text in streamer:
buffer += new_text
yield buffer

demo = gr.ChatInterface(
fn=bot_streaming,
title="Phi3 Vision 128K Instruct with OpenVINO",
examples=[
{"text": "What is on the flower?", "files": ["./bee.jpg"]},
{"text": "How to make this pastry?", "files": ["./baklava.png"]},
{"text": "What is the text saying?", "files": ["./small.png"]},
{"text": "What does the chart display?", "files": ["./chart.png"]},
],
description="Try the [Phi3-Vision model](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) from Microsoft wiht OpenVINO. Upload an image and start chatting about it, or simply try one of the examples below. If you won't upload an image, you will receive an error.",
stop_btn="Stop Generation",
multimodal=True,
)

return demo
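
# Hypothetical usage sketch (an assumption for illustration, not part of this
# commit): `model` and `processor` would be created earlier in the notebook,
# e.g. an OpenVINO-backed Phi-3-vision model with a transformers-style
# `generate` API and its matching processor.
#
#     demo = make_demo(model, processor)
#     demo.launch()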