feat: add support for Starling, Llama 3 and Phi-3 models
umbertogriffo committed May 25, 2024
1 parent 011fa5a commit 41d9bb4
Showing 14 changed files with 334 additions and 97 deletions.
200 changes: 115 additions & 85 deletions README.md

Large diffs are not rendered by default.

21 changes: 15 additions & 6 deletions chatbot/bot/model/model_settings.py
@@ -1,11 +1,14 @@
 from enum import Enum
 
-from bot.model.dolphin import DolphinSettings
-from bot.model.mistral import MistralSettings
-from bot.model.neural_beagle import NeuralBeagleSettings
-from bot.model.openchat import OpenChatSettings
-from bot.model.stablelm_zephyr import StableLMZephyrSettings
-from bot.model.zephyr import ZephyrSettings
+from bot.model.settings.dolphin import DolphinSettings
+from bot.model.settings.llama_3 import LlamaThreeSettings
+from bot.model.settings.mistral import MistralSettings
+from bot.model.settings.neural_beagle import NeuralBeagleSettings
+from bot.model.settings.openchat import OpenChatSettings
+from bot.model.settings.phi_3 import PhiThreeSettings
+from bot.model.settings.stablelm_zephyr import StableLMZephyrSettings
+from bot.model.settings.starling import StarlingSettings
+from bot.model.settings.zephyr import ZephyrSettings
 
 
 class ModelType(Enum):
@@ -14,7 +17,10 @@ class ModelType(Enum):
     DOLPHIN = "dolphin"
     STABLELM_ZEPHYR = "stablelm-zephyr"
     OPENCHAT = "openchat"
+    STARLING = "starling"
     NEURAL_BEAGLE = "neural-beagle"
+    PHI_3 = "phi-3"
+    LLAMA_3 = "llama-3"
 
 
 SUPPORTED_MODELS = {
@@ -23,7 +29,10 @@ class ModelType(Enum):
     ModelType.DOLPHIN.value: DolphinSettings,
     ModelType.STABLELM_ZEPHYR.value: StableLMZephyrSettings,
     ModelType.OPENCHAT.value: OpenChatSettings,
+    ModelType.STARLING.value: StarlingSettings,
     ModelType.NEURAL_BEAGLE.value: NeuralBeagleSettings,
+    ModelType.PHI_3.value: PhiThreeSettings,
+    ModelType.LLAMA_3.value: LlamaThreeSettings,
 }
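For orientation, here is a minimal sketch of how this registry is typically consumed; the `get_model_settings` helper is illustrative only and not part of this commit:

from bot.model.model_settings import SUPPORTED_MODELS, ModelType


def get_model_settings(model_name: str):
    # Hypothetical helper: resolve a settings class from the registry,
    # failing fast on unsupported model names.
    if model_name not in SUPPORTED_MODELS:
        raise ValueError(f"Unknown model {model_name!r}; supported: {sorted(SUPPORTED_MODELS)}")
    return SUPPORTED_MODELS[model_name]


settings = get_model_settings(ModelType.LLAMA_3.value)  # -> LlamaThreeSettings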
Empty file.
File renamed without changes.
68 changes: 68 additions & 0 deletions chatbot/bot/model/settings/llama_3.py
@@ -0,0 +1,68 @@
from bot.client.llm_client import LlmClientType
from bot.model.model import Model


class LlamaThreeSettings(Model):
    url = "https://huggingface.co/bartowski/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf"
    file_name = "Meta-Llama-3-8B-Instruct-Q4_K_M.gguf"
    clients = [LlmClientType.LAMA_CPP]
    config = {
        "n_ctx": 4096,  # The max sequence length to use - note that longer sequence lengths require much more resources
        "n_threads": 8,  # The number of CPU threads to use, tailor to your system and the resulting performance
        "n_gpu_layers": 50,  # The number of layers to offload to GPU, if you have GPU acceleration available
    }
    config_answer = {"temperature": 0.7, "stop": []}
    system_template = (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>You are a helpful, respectful and "
        "honest assistant. <|eot_id|><|start_header_id|>user<|end_header_id|>"
    )
    qa_prompt_template = """{system}\n
Answer the question below:
{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""
    ctx_prompt_template = """{system}\n
Context information is below.
---------------------
{context}
---------------------
Given the context information and not prior knowledge, answer the question below:
{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""
    refined_ctx_prompt_template = """{system}\n
{question}
We have provided an existing answer: {existing_answer}
We have the opportunity to refine the existing answer
(only if needed) with some more context below.
---------------------
{context}
---------------------
Given the new context, refine the original answer to better answer the query.
If the context isn't useful, return the original answer.
Refined Answer:<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""
    refined_question_conversation_awareness_prompt_template = """{system}\n
Chat History:
---------------------
{chat_history}
---------------------
Follow Up Question: {question}
Given the above conversation and a follow up question, rephrase the follow up question to be a standalone question.
Standalone question:<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

    refined_answer_conversation_awareness_prompt_template = """
You are engaging in a conversation with a human participant who is unaware that they might be
interacting with a machine. \n
Your goal is to respond in a way that convincingly simulates human-like intelligence and behavior. \n
The conversation should be natural, coherent, and contextually relevant. \n
Chat History:
---------------------
{chat_history}
---------------------
Follow Up Question: {question}\n
Given the context provided in the Chat History and the follow up question, please answer the follow up question above.
If the follow up question isn't correlated to the context provided in the Chat History, please just answer the follow up
question, ignoring the context provided in the Chat History.
Please also don't reformulate the follow up question, and write just a concise answer.
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""
File renamed without changes.
File renamed without changes.
File renamed without changes.
65 changes: 65 additions & 0 deletions chatbot/bot/model/settings/phi_3.py
@@ -0,0 +1,65 @@
from bot.client.llm_client import LlmClientType
from bot.model.model import Model


class PhiThreeSettings(Model):
    url = "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf"
    file_name = "Phi-3-mini-4k-instruct-q4.gguf"
    clients = [LlmClientType.LAMA_CPP]
    config = {
        "n_ctx": 4096,  # The max sequence length to use - note that longer sequence lengths require much more resources
        "n_threads": 8,  # The number of CPU threads to use, tailor to your system and the resulting performance
        "n_gpu_layers": 50,  # The number of layers to offload to GPU, if you have GPU acceleration available
    }
    config_answer = {"temperature": 0.7, "stop": []}
    system_template = "You are a helpful, respectful and honest assistant. "
    qa_prompt_template = """{system}\n
<|user|>\n Answer the question below:
{question}<|end|>\n<|assistant|>
"""
    ctx_prompt_template = """{system}\n
<|user|>\n Context information is below.
---------------------
{context}
---------------------
Given the context information and not prior knowledge, answer the question below:
{question}<|end|>\n<|assistant|>
"""
    refined_ctx_prompt_template = """{system}\n
<|user|>\n {question}
We have provided an existing answer: {existing_answer}
We have the opportunity to refine the existing answer
(only if needed) with some more context below.
---------------------
{context}
---------------------
Given the new context, refine the original answer to better answer the query.
If the context isn't useful, return the original answer.
Refined Answer:<|end|>\n<|assistant|>
"""
    refined_question_conversation_awareness_prompt_template = """{system}\n
<|user|>\n Chat History:
---------------------
{chat_history}
---------------------
Follow Up Question: {question}
Given the above conversation and a follow up question, rephrase the follow up question to be a standalone question.
Standalone question:<|end|>\n<|assistant|>
"""

    refined_answer_conversation_awareness_prompt_template = """
<|user|>\n You are engaging in a conversation with a human participant who is unaware that they might be
interacting with a machine. \n
Your goal is to respond in a way that convincingly simulates human-like intelligence and behavior. \n
The conversation should be natural, coherent, and contextually relevant. \n
Chat History:
---------------------
{chat_history}
---------------------
Follow Up Question: {question}\n
Given the context provided in the Chat History and the follow up question, please answer the follow up question above.
If the follow up question isn't correlated to the context provided in the Chat History, please just answer the follow up
question, ignoring the context provided in the Chat History.
Please also don't reformulate the follow up question, and write just a concise answer.
<|end|>\n<|assistant|>
"""
File renamed without changes.
65 changes: 65 additions & 0 deletions chatbot/bot/model/settings/starling.py
@@ -0,0 +1,65 @@
from bot.client.llm_client import LlmClientType
from bot.model.model import Model


class StarlingSettings(Model):
    url = "https://huggingface.co/bartowski/Starling-LM-7B-beta-GGUF/resolve/main/Starling-LM-7B-beta-Q4_K_M.gguf"
    file_name = "Starling-LM-7B-beta-Q4_K_M.gguf"
    clients = [LlmClientType.LAMA_CPP]
    config = {
        "n_ctx": 4096,  # The max sequence length to use - note that longer sequence lengths require much more resources
        "n_threads": 8,  # The number of CPU threads to use, tailor to your system and the resulting performance
        "n_gpu_layers": 50,  # The number of layers to offload to GPU, if you have GPU acceleration available
    }
    config_answer = {"temperature": 0.7, "stop": []}
    system_template = "You are a helpful, respectful and honest assistant. "
    qa_prompt_template = """{system}\n
GPT4 Correct User: Answer the question below:
{question}<|end_of_turn|>GPT4 Correct Assistant:
"""
    ctx_prompt_template = """{system}\n
GPT4 Correct User: Context information is below.
---------------------
{context}
---------------------
Given the context information and not prior knowledge, answer the question below:
{question}<|end_of_turn|>GPT4 Correct Assistant:
"""
    refined_ctx_prompt_template = """{system}\n
GPT4 Correct User: The original query is as follows: {question}
We have provided an existing answer: {existing_answer}
We have the opportunity to refine the existing answer
(only if needed) with some more context below.
---------------------
{context}
---------------------
Given the new context, refine the original answer to better answer the query.
If the context isn't useful, return the original answer.
Refined Answer:<|end_of_turn|>GPT4 Correct Assistant:
"""
    refined_question_conversation_awareness_prompt_template = """{system}\n
GPT4 Correct User: Chat History:
---------------------
{chat_history}
---------------------
Follow Up Question: {question}
Given the above conversation and a follow up question, rephrase the follow up question to be a standalone question.
Standalone question:<|end_of_turn|>GPT4 Correct Assistant:
"""

    refined_answer_conversation_awareness_prompt_template = """
GPT4 Correct User: You are engaging in a conversation with a human participant who is unaware that they might be
interacting with a machine. \n
Your goal is to respond in a way that convincingly simulates human-like intelligence and behavior. \n
The conversation should be natural, coherent, and contextually relevant. \n
Chat History:
---------------------
{chat_history}
---------------------
Follow Up Question: {question}\n
Given the context provided in the Chat History and the follow up question, please answer the follow up question above.
If the follow up question isn't correlated to the context provided in the Chat History, please just answer the follow up
question, ignoring the context provided in the Chat History.
Please also don't reformulate the follow up question, and write just a concise answer.
<|end_of_turn|>GPT4 Correct Assistant:
"""
File renamed without changes.
10 changes: 5 additions & 5 deletions todo.md
@@ -1,6 +1,6 @@
 # Todo
-- [ ] `llama-cpp-python` version `0.2.29` has a serious issue https://github.com/abetlen/llama-cpp-python/issues/1089 - Introduce unit tests to update to newer `llama-cpp-python` versions confidently.
-- [ ] try https://huggingface.co/TheBloke/Starling-LM-7B-alpha-GGUF (also the beta version).
-- [ ] try https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf
-- [ ] try Chat Templates https://medium.com/@ahmet_celebi/demystifying-chat-templates-of-llm-using-llama-cpp-and-ctransformers-f17871569cd6
-- [ ] make docker container
+- Test `openchat-3.6-8b-20240522`:
+  - https://huggingface.co/openchat/openchat-3.6-8b-20240522
+  - https://huggingface.co/bartowski/openchat-3.6-8b-20240522-GGUF
+- Try Chat Templates https://medium.com/@ahmet_celebi/demystifying-chat-templates-of-llm-using-llama-cpp-and-ctransformers-f17871569cd6
+- Make docker container
2 changes: 1 addition & 1 deletion version/llama_cpp
@@ -1 +1 @@
-0.2.28
+0.2.76
