From 93919ca95bc35bdd4528bbf366fb40339cc1af2e Mon Sep 17 00:00:00 2001
From: morteza heidari
Date: Mon, 10 Feb 2025 11:51:26 -0500
Subject: [PATCH] Added new examples for DeepSeek Models

---
 .../example/GPU/DeepSeek-Chatbot/README.md    |  76 ++++
 .../GPU/DeepSeek-Chatbot/deepseek_chatbot.py  | 389 ++++++++++++++++++
 .../GPU/DeepSeek-Chatbot/run_chatbot.sh       |  11 +
 3 files changed, 476 insertions(+)
 create mode 100644 python/llm/example/GPU/DeepSeek-Chatbot/README.md
 create mode 100644 python/llm/example/GPU/DeepSeek-Chatbot/deepseek_chatbot.py
 create mode 100644 python/llm/example/GPU/DeepSeek-Chatbot/run_chatbot.sh

diff --git a/python/llm/example/GPU/DeepSeek-Chatbot/README.md b/python/llm/example/GPU/DeepSeek-Chatbot/README.md
new file mode 100644
index 00000000000..d31dead4a8d
--- /dev/null
+++ b/python/llm/example/GPU/DeepSeek-Chatbot/README.md
@@ -0,0 +1,76 @@
+# DeepSeek Chatbot
+
+A hybrid CPU+XPU chatbot powered by DeepSeek models and optimized with the Intel IPEX-LLM library for fast, efficient inference on Intel AI PC systems. The chatbot supports large-model inference, document-based querying, and conversation summarization. Users can toggle between computation devices (CPU, XPU, or hybrid CPU+XPU) and choose among several pre-trained DeepSeek models.
+
+## Features
+
+- **Model Selection**: Supports multiple pre-trained DeepSeek models, including chat and base variants.
+- **Device Flexibility**: Offers CPU-only, GPU (XPU)-only, and hybrid CPU+XPU modes for optimal performance on Intel AI PCs.
+- **Document Upload**: Accepts PDF and Word documents to enable retrieval-augmented generation (RAG) for question answering.
+- **Conversation Summarization**: Summarizes long conversations to keep the chatbot's memory efficient and concise.
+- **Interactive UI**: Browser-based interface with separate tabs for chatting, document upload, and settings.
+
+## Installation
+
+**Requirements**
+
+Before running the chatbot, make sure the following prerequisites are in place:
+
+- **Operating System**: An Intel AI PC running Linux or Windows.
+- **Python Environment**: A Conda environment with Python 3.11 and the required libraries.
+
+**Install IPEX-LLM for Intel GPUs**
+
+Run the following commands in your terminal:
+
+```bash
+conda create -n llm python=3.11 libuv -y
+conda activate llm
+pip install --pre --upgrade ipex-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu
+```
+
+## Running the Chatbot
+
+1. Activate the environment: `conda activate llm`
+2. Change to the directory containing `deepseek_chatbot.py`: `cd /path/to/DeepSeek-Chatbot`
+3. Start the chatbot: `python deepseek_chatbot.py`
+4. Open your browser and navigate to `http://127.0.0.1:7860/`
+
+## Features Overview
+
+1. **Chat Interface (Tab 1)**
+   Type questions or commands in the textbox. Suggested questions:
+   - "What is a GPU?"
+   - "What are the differences between CPU and GPU?"
+   - "Summarize the uploaded document."
+2. **Document Upload (Tab 2)**
+   Upload PDF or Word documents so the chatbot can extract their text and answer questions about the content. Supported file formats: `.pdf`, `.doc`, `.docx`.
+3. **Settings (Tab 3)**
+   Pick the model and the compute device (detailed in the next two sections). All models are quantized to 4 bits when they are loaded; an illustrative sketch of that load call is shown below.
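+
+For reference, the snippet below is a minimal sketch of roughly how `deepseek_chatbot.py` loads a model with IPEX-LLM; the model name and target device are simply the defaults used elsewhere in this example, not extra requirements:
+
+```python
+from ipex_llm.transformers import AutoModelForCausalLM
+from transformers import AutoTokenizer
+
+# Load the checkpoint with IPEX-LLM 4-bit (INT4) weight quantization.
+model = AutoModelForCausalLM.from_pretrained(
+    "deepseek-ai/deepseek-llm-7b-chat",
+    load_in_4bit=True,
+)
+tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-llm-7b-chat")
+
+# Move the quantized model to the Intel GPU ("xpu"); omit this step to stay on CPU.
+model = model.to("xpu")
+```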
+
+## Select Model
+
+Choose from pre-trained DeepSeek models such as:
+
+- deepseek-ai/deepseek-llm-7b-chat
+- deepseek-ai/deepseek-llm-7b-base
+- deepseek-ai/DeepSeek-R1-Distill-Qwen-14B (this larger model requires more RAM than the 7B variants)
+
+## Select Device
+
+The Settings tab makes it easy to toggle between CPU and GPU execution:
+
+- CPU: Use the CPU for all computations.
+- XPU: Use the Intel GPU for inference.
+- CPU+XPU: Hybrid mode for large models, with embeddings on the CPU and the main layers on the GPU.
+
+## Known Limitations
+
+- The uploaded document's content must be extractable as text (some scanned PDFs will not work).
+- Hybrid mode is intended for large models but still requires sufficient system memory.
+
+## Additional Notes
+
+- For troubleshooting, ensure that the IPEX-LLM package is installed correctly and that your system has a supported Intel GPU.
+- If the UI is not reachable at `http://127.0.0.1:7860/`, verify that the server is running and check for firewall restrictions.
+
+## Future Enhancements
+
+This initial version demonstrates how easily IPEX-LLM can run large language models on Intel AI PCs. Future updates will add more advanced models and stronger Retrieval-Augmented Generation (RAG) capabilities; the current version only retrieves a naive keyword-matched snippet from the uploaded document. Stay tuned!
diff --git a/python/llm/example/GPU/DeepSeek-Chatbot/deepseek_chatbot.py b/python/llm/example/GPU/DeepSeek-Chatbot/deepseek_chatbot.py
new file mode 100644
index 00000000000..9ccca3379e8
--- /dev/null
+++ b/python/llm/example/GPU/DeepSeek-Chatbot/deepseek_chatbot.py
@@ -0,0 +1,389 @@
+import os
+import torch
+import gradio as gr
+from threading import Thread
+
+from ipex_llm.transformers import AutoModelForCausalLM
+from transformers import AutoTokenizer, TextIteratorStreamer
+
+try:
+    import PyPDF2
+except ImportError:
+    PyPDF2 = None
+
+try:
+    import docx2txt
+except ImportError:
+    docx2txt = None
+
+############################################################################
+# 1) Global config
+############################################################################
+MODEL_OPTIONS = [
+    "deepseek-ai/deepseek-llm-7b-chat",
+    "deepseek-ai/deepseek-llm-7b-base",
+    "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
+]
+current_model_name = MODEL_OPTIONS[0]
+
+model_in_4bit = None
+model_in_4bit_gpu = None
+tokenizer = None
+
+use_xpu = False
+use_hybrid = False  # for CPU + XPU usage
+
+DOC_TEXT = ""
+MAX_TURNS_BEFORE_SUMMARY = 6
+
+
+############################################################################
+# 2) Document Parsing
+############################################################################
+def parse_uploaded_file(file_obj) -> str:
+    global DOC_TEXT
+    if file_obj is None:
+        DOC_TEXT = ""
+        return "No file uploaded."
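+
+    # Everything below is the extraction path: resolve the file extension, then
+    # use PyPDF2 for .pdf files and docx2txt for .doc/.docx files (both imports
+    # are optional above, so a missing library falls through to the
+    # "Unsupported file extension or missing libraries" branch).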
+ + file_path = file_obj.name + file_ext = os.path.splitext(file_path)[1].lower() + extracted_text = "" + + if file_ext == ".pdf" and PyPDF2 is not None: + try: + with open(file_path, "rb") as f: + reader = PyPDF2.PdfReader(f) + for page_num in range(len(reader.pages)): + page_text = reader.pages[page_num].extract_text() or "" + extracted_text += page_text + except Exception as e: + extracted_text = f"Error reading PDF: {e}" + elif file_ext in [".doc", ".docx"] and docx2txt is not None: + try: + extracted_text = docx2txt.process(file_path) or "" + except Exception as e: + extracted_text = f"Error reading Word: {e}" + else: + extracted_text = f"Unsupported file extension or missing libraries for {file_ext}" + + DOC_TEXT = extracted_text.strip() + if not DOC_TEXT or "Error reading" in extracted_text: + return f"Document text is empty or invalid.\nExtracted: {extracted_text[:300]}" + else: + snippet = DOC_TEXT[:300].replace("\n", " ") + return ( + f"Successfully parsed document. Text length = {len(DOC_TEXT)}.\n" + f"Preview:\n{snippet}..." + ) + +############################################################################ +# 3) Retrieval logic, Summarization, etc. +############################################################################ + + +def naive_retrieve_snippet(query: str, doc_text: str, window=300) -> str: + query_lower = query.lower() + text_lower = doc_text.lower() + idx = text_lower.find(query_lower) + if idx == -1: + return "" + else: + start = max(0, idx - window // 2) + end = min(len(doc_text), idx + window // 2) + return doc_text[start:end] + + +def summarize_history(history, model, tok): + """ + Summarize the entire conversation so far into a concise 'memory.' + We'll call the same model for a quick summary. + """ + # Convert history into a text block + conversation_text = [] + for user_msg, bot_msg in history: + conversation_text.append(f"User: {user_msg}") + conversation_text.append(f"Assistant: {bot_msg if bot_msg else ''}") + convo_str = "\n".join(conversation_text) + + # Build a simple prompt for summarizing + summary_prompt = ( + "System: Please summarize the following conversation in 2-3 sentences. " + "Focus on key details, topics, or questions.\n\n" + f"Conversation:\n{convo_str}\n\nSummary:" + ) + + # We'll do a blocking generate call with minimal tokens. No streaming needed. + input_ids = tok([summary_prompt], return_tensors='pt') + if use_xpu or use_hybrid: + input_ids = input_ids.to('xpu') + + # We'll do a short generation (max_new_tokens ~ 64) + output = model.generate( + **input_ids, + max_new_tokens=64, + do_sample=True, + temperature=0.5 + ) + + summary_text = tok.decode( + output[0][input_ids["input_ids"].shape[1]:], skip_special_tokens=True) + # Clean up the summary if needed + return summary_text.strip() + +############################################################################ +# 4) System prompt & conversation format +############################################################################ + + +def format_prompt(user_query, retrieved_context, chat_history): + system_text = ( + "System: You DO have access to the user's document text. It is appended below. " + "Use it to answer questions about the document. Please do not say you have no file access. " + "If the user wants a summary, you can provide one. Always answer in English.\n\n" + + "## Few-Shot Example 1 (Multi-turn style)\n" + "User: Hello, how are you?\n" + "Assistant: Hello! I'm doing well, thank you. 
How can I assist you today?\n\n" + "User: Can you remember what we discussed about GPUs?\n" + "Assistant: Yes, you asked about the definition of a GPU previously, and I explained that it's " + "a specialized processor designed for parallel computations.\n\n" + + "## Few-Shot Example 2 (Referring to doc snippet)\n" + "User: I uploaded a document. Could you summarize it?\n" + "Assistant: Certainly! Let me look at the document text appended. " + "According to the snippet, it discusses advanced techniques in machine learning...\n\n" + "User: Thanks. And can you highlight the part about regularization?\n" + "Assistant: Sure! The doc states that dropout and weight decay are two forms of regularization...\n\n" + + "## End of Few-Shot Examples\n\n" + ) + + if retrieved_context.strip(): + system_text += f"Document snippet or full text:\n{retrieved_context}\n\n" + else: + system_text += "No relevant snippet found or doc is empty.\n\n" + + system_text += "Now continue the actual conversation:\n\n" + + prompt = [system_text] + + for user_msg, bot_msg in chat_history: + if bot_msg is not None: + prompt.append(f"User: {user_msg}\nAssistant: {bot_msg}\n\n") + else: + prompt.append(f"User: {user_msg}\nAssistant:") + + prompt.append(f"User: {user_query}\nAssistant:") + return "".join(prompt) + +############################################################################ +# 5) Model loading & CPU+XPU logic +############################################################################ + + +def load_model(model_name, cpu_embedding=False): + """ + If cpu_embedding=True, we pass that param so that large embeddings + can live on CPU, with the rest on XPU. + """ + print( + f"[*] Loading model: {model_name} with cpu_embedding={cpu_embedding}") + m = AutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path=model_name, + load_in_4bit=True, + cpu_embedding=cpu_embedding # custom flag from ipex_llm.transformers if supported + ) + t = AutoTokenizer.from_pretrained(model_name) + return m, t + + +def set_model(model_name): + global model_in_4bit, model_in_4bit_gpu, tokenizer, use_xpu, use_hybrid, current_model_name + current_model_name = model_name + + # decide if we want cpu_embedding + if use_hybrid: + model_in_4bit, tokenizer_ = load_model(model_name, cpu_embedding=True) + else: + model_in_4bit, tokenizer_ = load_model(model_name, cpu_embedding=False) + tokenizer = tokenizer_ + + if use_xpu or use_hybrid: + # Move the rest to XPU + model_in_4bit_gpu = model_in_4bit.to("xpu") + else: + # CPU only + model_in_4bit_gpu = model_in_4bit.to("cpu") + + return f"✅ Model loaded: {model_name}, hybrid={use_hybrid}" + + +def set_device(device_choice): + """ + device_choice can be 'CPU', 'XPU', or 'CPU+XPU' + """ + global use_xpu, use_hybrid, model_in_4bit_gpu, model_in_4bit + + if device_choice == "CPU": + use_xpu = False + use_hybrid = False + model_in_4bit_gpu = model_in_4bit.to("cpu") + return "✅ Using CPU only." + elif device_choice == "XPU": + use_xpu = True + use_hybrid = False + model_in_4bit_gpu = model_in_4bit.to("xpu") + return "✅ Using XPU (Intel GPU)." + else: # "CPU+XPU" + use_xpu = True + use_hybrid = True + # We'll reload the model with cpu_embedding=True + # to place large embeddings on CPU. Then .to("xpu") for the rest. + print( + "[INFO] Re-loading model for hybrid usage: CPU embeddings + XPU for main layers.") + # re-set the model + set_model(current_model_name) + return "✅ Using Hybrid: CPU embeddings + XPU." 
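+
+# Note on the three device modes handled above (comments only, nothing executed):
+#   "CPU"     keeps the already-loaded 4-bit weights and moves them to the CPU.
+#   "XPU"     moves the already-loaded 4-bit weights to the Intel GPU.
+#   "CPU+XPU" re-loads the model via set_model() with cpu_embedding=True, so the
+#             large embedding table stays in host memory while the remaining
+#             layers are placed on the XPU by the .to("xpu") call in set_model().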
+ +############################################################################ +# 6) Generation logic (streaming) +############################################################################ + + +def generate_stream(input_ids, streamer, model): + def run_generation(): + model.generate( + **input_ids, + streamer=streamer, + max_new_tokens=256, + do_sample=True, + temperature=0.7, + top_p=0.9, + top_k=50, + repetition_penalty=1.1 + ) + thread = Thread(target=run_generation) + thread.start() + for new_text in streamer: + yield new_text + +############################################################################ +# 7) Chat & Summaries +############################################################################ + + +def chat(user_input, history): + user_lower = user_input.lower() + if ("summarize" in user_lower or "document" in user_lower) and DOC_TEXT: + retrieved_context = DOC_TEXT + else: + retrieved_context = naive_retrieve_snippet(user_input, DOC_TEXT) + + prompt = format_prompt(user_input, retrieved_context, history) + + # Move input_ids to xpu if hybrid or xpu + if use_xpu or use_hybrid: + input_ids = tokenizer([prompt], return_tensors='pt').to("xpu") + else: + input_ids = tokenizer([prompt], return_tensors='pt') + + streamer = TextIteratorStreamer( + tokenizer, skip_prompt=True, skip_special_tokens=True) + stream_output = generate_stream(input_ids, streamer, model_in_4bit_gpu) + + partial_bot_response = "" + for piece in stream_output: + partial_bot_response += piece + updated_history = [list(item) for item in history] + updated_history[-1][1] = partial_bot_response + yield updated_history + + +def user(query, chat_history): + new_history = chat_history + [[query, None]] + if len(new_history) > MAX_TURNS_BEFORE_SUMMARY: + summary = summarize_history(new_history, model_in_4bit_gpu, tokenizer) + new_history = [("Memory summary of previous conversation", summary)] + return "", new_history + + +############################################################################ +# 8) Build the Gradio UI +############################################################################ +print(f"[*] Loading default model: {MODEL_OPTIONS[0]}") +# default is CPU only +model_in_4bit, tokenizer = load_model(MODEL_OPTIONS[0], cpu_embedding=False) +model_in_4bit_gpu = model_in_4bit.to("cpu") + +with gr.Blocks() as demo: + gr.Markdown("## Hybrid CPU+XPU Option Example") + + with gr.Tabs(): + with gr.Tab("Chat"): + chatbot = gr.Chatbot() + user_input = gr.Textbox( + show_label=False, placeholder="Ask me anything...") + suggestion_btn1 = gr.Button("What is a GPU?") + suggestion_btn2 = gr.Button("Differences between CPU & GPU?") + suggestion_btn3 = gr.Button("Summarize the uploaded document") + + suggestion_btn1.click( + fn=lambda: "What is a GPU?", inputs=None, outputs=user_input) + suggestion_btn2.click( + fn=lambda: "What are the differences between CPU and GPU?", inputs=None, outputs=user_input) + suggestion_btn3.click( + fn=lambda: "Summarize the uploaded document", inputs=None, outputs=user_input) + + clear_btn = gr.Button("Clear Chat") + + user_input.submit(fn=user, inputs=[user_input, chatbot], outputs=[user_input, chatbot], queue=False).then( + fn=chat, + inputs=[user_input, chatbot], + outputs=chatbot + ) + + def clear_fn(): + return [] + + clear_btn.click(fn=clear_fn, inputs=[], + outputs=chatbot, queue=False) + + with gr.Tab("Upload Document"): + file_input = gr.File( + label="Upload PDF/Word (.pdf, .doc, .docx)", file_types=[".pdf", ".doc", ".docx"]) + upload_status = gr.Textbox( + label="Upload 
Status", interactive=False) + + def handle_upload(file): + return parse_uploaded_file(file) + + file_input.change(fn=handle_upload, + inputs=file_input, outputs=upload_status) + + with gr.Tab("Settings"): + model_choice = gr.Dropdown( + choices=MODEL_OPTIONS, value=MODEL_OPTIONS[0], label="Select Model", interactive=True) + device_choice = gr.Radio(choices=[ + "CPU", "XPU", "CPU+XPU"], value="CPU", label="Select Device", interactive=True) + model_status = gr.Textbox( + value=f"Current Model: {MODEL_OPTIONS[0]}", label="Model Status", interactive=False) + device_status = gr.Textbox( + value="Current Device: CPU", label="Device Status", interactive=False) + + def update_model_box(selected_model): + msg = set_model(selected_model) + return f"Current Model: {selected_model} | {msg}" + + model_choice.change(fn=update_model_box, + inputs=model_choice, outputs=model_status) + + def update_device_box(device): + msg = set_device(device) + return f"Device: {device} | {msg}" + + device_choice.change(fn=update_device_box, + inputs=device_choice, outputs=device_status) + +demo.queue() +demo.launch(server_name="0.0.0.0", server_port=7860) diff --git a/python/llm/example/GPU/DeepSeek-Chatbot/run_chatbot.sh b/python/llm/example/GPU/DeepSeek-Chatbot/run_chatbot.sh new file mode 100644 index 00000000000..e1c6ad4f773 --- /dev/null +++ b/python/llm/example/GPU/DeepSeek-Chatbot/run_chatbot.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +# Shell script to run the DeepSeek Chatbot +echo "Starting DeepSeek Chatbot with IPEX-LLM support..." + +# Activate virtual environment if required (uncomment and modify if needed) +# source path_to_virtualenv/bin/activate + +# Run the chatbot script +echo "Launching chatbot..." +python deepseek_chatbot.py