phi-3 vision notebook
eaidova committed Jul 16, 2024
1 parent 173c19a commit fa40b9e
Showing 4 changed files with 1,062 additions and 0 deletions.
21 changes: 21 additions & 0 deletions notebooks/phi-3-vision/README.md
@@ -0,0 +1,21 @@
## Visual-language assistant with Phi3-Vision and OpenVINO

[Phi-3-Vision-128K-Instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) is a lightweight, state-of-the-art open multimodal model built upon datasets that include synthetic data and filtered publicly available websites, with a focus on very high-quality, reasoning-dense data in both text and vision. The model belongs to the Phi-3 model family, and this multimodal version supports a context length of 128K tokens. The model underwent a rigorous enhancement process, incorporating both supervised fine-tuning and direct preference optimization, to ensure precise instruction adherence and robust safety measures. More details about the model can be found in the [model blog post](https://azure.microsoft.com/en-us/blog/new-models-added-to-the-phi-3-family-available-on-microsoft-azure/), the [technical report](https://aka.ms/phi3-tech-report), and the [Phi-3-cookbook](https://github.com/microsoft/Phi-3CookBook).

In this tutorial we consider how to launch Phi-3-vision using OpenVINO to create a multimodal chatbot. Additionally, we optimize the model to low precision using [NNCF](https://github.com/openvinotoolkit/nncf).
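As a preview of the optimization step, the sketch below shows how the language-model part of a converted OpenVINO IR might be compressed to 4-bit weights with NNCF. The file paths and compression parameters are placeholders for illustration, not the notebook's actual values:

```python
# A minimal weight-compression sketch, assuming the model has already been
# converted to OpenVINO IR; the paths below are placeholders.
import openvino as ov
import nncf

core = ov.Core()
ov_model = core.read_model("phi-3-vision/language_model.xml")

# INT4 weight compression shrinks the model footprint at a small accuracy cost;
# group_size and ratio are typical values, not tuned for this model.
compressed_model = nncf.compress_weights(
    ov_model,
    mode=nncf.CompressWeightsMode.INT4_ASYM,
    group_size=128,
    ratio=0.8,
)
ov.save_model(compressed_model, "phi-3-vision/language_model_int4.xml")
```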

## Notebook contents
The tutorial consists of the following steps:

- Install requirements
- Convert and Optimize model
- Run OpenVINO model inference (see the sketch after this list)
- Launch Interactive demo
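
For the inference step, here is a minimal single-turn sketch. It assumes `model` and `processor` are the OpenVINO-backed model and its processor produced by the conversion step, with the model exposing a transformers-style `generate` API, as the Gradio helper in this example relies on:

```python
# A minimal single-turn inference sketch; `model` and `processor` are assumed
# to come from the notebook's conversion step.
from PIL import Image

image = Image.open("bee.jpg")  # any local image
conversation = [{"role": "user", "content": "<|image_1|>\nWhat is on the flower?"}]
prompt = processor.tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)

inputs = processor(prompt, image, return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=128, eos_token_id=processor.tokenizer.eos_token_id)

# Decode only the newly generated tokens, skipping the prompt.
answer = processor.tokenizer.decode(output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
print(answer)
```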

In this demonstration, you'll create an interactive chatbot that can answer questions about the content of a provided image.


## Installation instructions
This is a self-contained example that relies solely on its own code.<br/>
We recommend running the notebook in a virtual environment. You only need a Jupyter server to start.
For details, please refer to [Installation Guide](../../README.md).
113 changes: 113 additions & 0 deletions notebooks/phi-3-vision/gradio_helper.py
@@ -0,0 +1,113 @@
from pathlib import Path
from threading import Thread

import gradio as gr
import requests
from PIL import Image
from transformers import TextIteratorStreamer


def make_demo(model, processor):
    example_image_urls = [
        ("https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/1d6a0188-5613-418d-a1fd-4560aae1d907", "bee.jpg"),
        ("https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/6cc7feeb-0721-4b5d-8791-2576ed9d2863", "baklava.png"),
        ("https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/dd5105d6-6a64-4935-8a34-3058a82c8d5d", "small.png"),
        ("https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/1221e2a8-a6da-413a-9af6-f04d56af3754", "chart.png"),
    ]

    # Download the example images once; subsequent runs reuse the local files.
    for url, file_name in example_image_urls:
        if not Path(file_name).exists():
            Image.open(requests.get(url, stream=True).raw).save(file_name)

    def bot_streaming(message, history):
        print(f"message is - {message}")
        print(f"history is - {history}")
        image = None
        if message["files"]:
            # message["files"][-1] is either a dict with a "path" key or a plain path string
            if isinstance(message["files"][-1], dict):
                image = message["files"][-1]["path"]
            else:
                image = message["files"][-1]
        else:
            # If no image was uploaded this turn, reuse the most recent image
            # from past turns (kept inside tuples in the history).
            for hist in history:
                if isinstance(hist[0], tuple):
                    image = hist[0][0]
        if image is None:
            raise gr.Error("You need to upload an image for Phi3-Vision to work. Close the error and try again with an image.")

        conversation = []
        image_turn_pending = False
        for user, assistant in history:
            if assistant is None:
                # This history entry only carried the uploaded image; remember it
                # so the image placeholder can be merged into the next text turn.
                image_turn_pending = True
                conversation.append({"role": "user", "content": ""})
                continue
            if image_turn_pending:
                # Attach the image placeholder to the first text turn after the image.
                conversation[0]["content"] = f"<|image_1|>\n{user}"
                conversation.append({"role": "assistant", "content": assistant})
                image_turn_pending = False
                continue
            conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])

        if len(history) == 0:
            conversation.append({"role": "user", "content": f"<|image_1|>\n{message['text']}"})
        else:
            conversation.append({"role": "user", "content": message["text"]})
        print(f"conversation is -\n{conversation}")
prompt = processor.tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
image = Image.open(image)
inputs = processor(prompt, image, return_tensors="pt")

        streamer = TextIteratorStreamer(
            processor,
            skip_special_tokens=True,
            skip_prompt=True,
            clean_up_tokenization_spaces=False,
        )
        generation_kwargs = dict(
            inputs,
            streamer=streamer,
            max_new_tokens=1024,
            do_sample=False,  # greedy decoding, so no temperature is needed
            eos_token_id=processor.tokenizer.eos_token_id,
        )

        # Run generation in a background thread so tokens can be streamed to the UI.
        thread = Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()

buffer = ""
for new_text in streamer:
buffer += new_text
yield buffer

demo = gr.ChatInterface(
fn=bot_streaming,
title="Phi3 Vision 128K Instruct with OpenVINO",
examples=[
{"text": "What is on the flower?", "files": ["./bee.jpg"]},
{"text": "How to make this pastry?", "files": ["./baklava.png"]},
{"text": "What is the text saying?", "files": ["./small.png"]},
{"text": "What does the chart display?", "files": ["./chart.png"]},
],
description="Try the [Phi3-Vision model](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) from Microsoft wiht OpenVINO. Upload an image and start chatting about it, or simply try one of the examples below. If you won't upload an image, you will receive an error.",
stop_btn="Stop Generation",
multimodal=True,
)

return demo
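
# Hypothetical usage sketch (an assumption for illustration, not part of this
# commit): `model` and `processor` would be created earlier in the notebook,
# e.g. an OpenVINO-backed Phi-3-vision model with a transformers-style
# `generate` API and its matching processor.
#
#     demo = make_demo(model, processor)
#     demo.launch()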