diff --git a/app/src/modules/chatbot.py b/app/src/modules/chatbot.py
index 48d8c55..3a849ce 100644
--- a/app/src/modules/chatbot.py
+++ b/app/src/modules/chatbot.py
@@ -99,9 +99,10 @@ def langchain_rag(self, rag_params, chat_instr, context_instr, input, chat_histo
         retrieved_documents = retriever.invoke(input)
         logger.debug("Retrieved %i documents", len(retrieved_documents))
+        logger.info("Retrieved %i documents", len(retrieved_documents))
         # Retrieve documents for inspection (Use for debugging)
-        # for i, doc in enumerate(retrieved_documents):
-        #     logger.debug("Document %i %s", i + 1, doc)
+        for i, doc in enumerate(retrieved_documents):
+            logger.info("Document %i %s", i + 1, doc)
 
         # QA Chain
         context_messages = [("system", context_instr)]
diff --git a/app/src/modules/metadata.py b/app/src/modules/metadata.py
index 1132121..b05edff 100644
--- a/app/src/modules/metadata.py
+++ b/app/src/modules/metadata.py
@@ -139,6 +139,19 @@ def ll_models():
             "frequency_penalty": [0.0, 0.0, -1.0, 1.0],
             "presence_penalty": [0.0, 0.0, -2.0, 2.0],
         },
+        "tgi": {
+            "enabled": False,
+            "api": "OpenAI",
+            "url": "http://127.0.0.1:8080",
+            "api_key": "",
+            "openai_compat": True,
+            "context_length": 127072,
+            "temperature": [1.0, 1.0, 0.0, 2.0],
+            "top_p": [0.99, 0.99, 0.0, 0.99],
+            "max_tokens": [256, 256, 1, 8191],
+            "frequency_penalty": [0.0, 0.0, -1.0, 1.0],
+            "presence_penalty": [0.0, 0.0, -2.0, 2.0],
+        },
         "gpt-4o": {
             "enabled": os.getenv("OPENAI_API_KEY") is not None,
             "api": "OpenAI",
@@ -192,6 +205,20 @@ def ll_models():
             "frequency_penalty": [0.0, 0.0, -2.0, 2.0],
             "presence_penalty": [0.0, 0.0, -2.0, 2.0],
         },
+        # llama3.2-3b
+        "llama3.2": {
+            "enabled": os.getenv("ON_PREM_OLLAMA_URL") is not None,
+            "api": "ChatOllama",
+            "url": os.environ.get("ON_PREM_OLLAMA_URL", default="http://127.0.0.1:11434"),
+            "api_key": "",
+            "openai_compat": True,
+            "context_length": 131072,
+            "temperature": [1.0, 1.0, 0.0, 2.0],
+            "top_p": [1.0, 1.0, 0.0, 1.0],
+            "max_tokens": [256, 256, 1, 2048],
+            "frequency_penalty": [0.0, 0.0, -2.0, 2.0],
+            "presence_penalty": [0.0, 0.0, -2.0, 2.0],
+        },
     }
     return ll_models_dict
diff --git a/app/src/modules/utilities.py b/app/src/modules/utilities.py
index 9c80d37..5ed4ad0 100644
--- a/app/src/modules/utilities.py
+++ b/app/src/modules/utilities.py
@@ -110,7 +110,7 @@ def get_ll_model(model, ll_models_config=None, giskarded=False):
         _client = OpenAI(api_key=giskard_key, base_url=f"{llm_url}/v1/")
         client = OpenAIClient(model=model, client=_client)
     elif llm_api == "OpenAI":
-        client = ChatOpenAI(api_key=lm_params["api_key"], **common_params)
+        client = ChatOpenAI(api_key=lm_params["api_key"], base_url=f"{llm_url}/v1/", **common_params)
     elif llm_api == "Cohere":
         client = ChatCohere(cohere_api_key=lm_params["api_key"], **common_params)
     elif llm_api == "ChatPerplexity":
diff --git a/spring_ai/README.md b/spring_ai/README.md
index ff8a9b8..538832a 100644
--- a/spring_ai/README.md
+++ b/spring_ai/README.md
@@ -122,6 +122,7 @@ ollama:
     number: 1
   models:
     - llama3.1
+    - llama3.2
     - mxbai-embed-large
     - nomic-embed-text
 nodeSelector:
diff --git a/spring_ai/ollama-values.yaml b/spring_ai/ollama-values.yaml
new file mode 100644
index 0000000..120f621
--- /dev/null
+++ b/spring_ai/ollama-values.yaml
@@ -0,0 +1,12 @@
+ollama:
+  gpu:
+    enabled: true
+    type: 'nvidia'
+    number: 1
+  models:
+    - llama3.1
+    - llama3.2
+    - mxbai-embed-large
+    - nomic-embed-text
+nodeSelector:
+  node.kubernetes.io/instance-type: VM.GPU.A10.1
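
Note on the `utilities.py` change: passing `base_url=f"{llm_url}/v1/"` to `ChatOpenAI` is what lets the new `tgi` metadata entry route "OpenAI"-API calls to a local Text Generation Inference server instead of api.openai.com. Below is a minimal sketch of the client this branch would now construct, assuming the `langchain-openai` package and a TGI server listening on the URL from `metadata.py`; the `model` name and the key fallback are illustrative, not part of the patch:

```python
# Sketch only: approximates the "OpenAI" branch of get_ll_model() after this patch.
# Assumes langchain-openai is installed and TGI is serving its OpenAI-compatible
# API at http://127.0.0.1:8080/v1/ (the URL from the new "tgi" entry).
from langchain_openai import ChatOpenAI

lm_params = {
    "url": "http://127.0.0.1:8080",  # from the "tgi" entry added in metadata.py
    "api_key": "",                   # TGI does not check API keys by default
}
llm_url = lm_params["url"]

client = ChatOpenAI(
    api_key=lm_params["api_key"] or "not-needed",  # hypothetical fallback; an empty key is rejected client-side
    base_url=f"{llm_url}/v1/",
    model="tgi",  # TGI accepts an arbitrary model name on this route
    temperature=1.0,
    max_tokens=256,
)

print(client.invoke("Say hello").content)
```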
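
Similarly, with `ON_PREM_OLLAMA_URL` set, the new `llama3.2` entry is dispatched through the `ChatOllama` branch. A sketch under the same caveats (assumes the `langchain-ollama` package; the prompt and parameter mapping are illustrative):

```python
# Sketch only: how the new "llama3.2" metadata entry is expected to be consumed.
# Assumes langchain-ollama is installed and Ollama is reachable at the default
# URL from metadata.py when ON_PREM_OLLAMA_URL is unset.
import os

from langchain_ollama import ChatOllama

client = ChatOllama(
    model="llama3.2",
    base_url=os.environ.get("ON_PREM_OLLAMA_URL", "http://127.0.0.1:11434"),
    temperature=1.0,
    num_predict=256,  # Ollama's counterpart to max_tokens
)

print(client.invoke("Say hello").content)
```

The new `spring_ai/ollama-values.yaml` duplicates the values block shown in the README, so the same GPU settings and model list can be supplied to the Helm release with `-f ollama-values.yaml` instead of inline edits.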