diff --git a/services/docsite_explainer/README.md b/services/docsite_explainer/README.md
new file mode 100644
index 00000000..c86091a2
--- /dev/null
+++ b/services/docsite_explainer/README.md
@@ -0,0 +1,31 @@
+# Docsite Explainer
+
+This is a service for explaining passages in the OpenFn documentation.
+
+This service takes an input text and queries a Claude model for a clarification. It also searches an embedding collection containing the full documentation and passes the retrieved passages to the model as additional context.
+
+## Implementation Notes
+
+This service currently uses Zilliz as the storage provider for the OpenAI embeddings, and requires both Zilliz (URI, token) and OpenAI credentials to run.
+
+## Usage
+
+This service can be run from the services folder via the entry.py module, using a JSON file containing the text that needs clarification:
+
+```bash
+python entry.py docsite_explainer tmp/docsite_explainer_input.json tmp/docsite_output.json
+```
+
+JSON input format example:
+
+```json
+{
+  "text": "JavaScript is an untyped language - which is very convenient for OpenFn jobs and usually makes life easier."
+}
+```
+
+This service can also be run using curl:
+
+```bash
+curl -X POST http://localhost:3000/services/docsite_explainer --json @tmp/docsite_explainer_input.json
+```
\ No newline at end of file
diff --git a/services/docsite_explainer/docsite_explainer.py b/services/docsite_explainer/docsite_explainer.py
new file mode 100644
index 00000000..a349ba4e
--- /dev/null
+++ b/services/docsite_explainer/docsite_explainer.py
@@ -0,0 +1,94 @@
+import os
+from dotenv import load_dotenv
+import anthropic
+from search.search import Payload, validate_payload, connect_to_milvus, get_search_embeddings, search_database, extract_documents
+from docsite_explainer.prompts import build_prompt
+
+load_dotenv()
+ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+ZILLIZ_URI = os.getenv("ZILLIZ_CLOUD_URI")
+ZILLIZ_TOKEN = os.getenv("ZILLIZ_CLOUD_API_KEY")
+
+client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)
+
+
+def query_llm(system_prompt, formatted_user_prompt, model="claude-3-5-sonnet-20241022", max_tokens=500):
+    message = client.messages.create(
+        model=model,
+        max_tokens=max_tokens,
+        temperature=0,
+        system=system_prompt,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": formatted_user_prompt
+                    }
+                ]
+            }
+        ]
+    )
+    answer = message.content[0].text
+
+    return answer
+
+def explain_docs(input_text):
+
+    # Set and validate database settings
+    settings_dict = {"api_key": OPENAI_API_KEY, "query": input_text, "partition_name": "normal_docs", "limit": 4}
+    data = Payload(settings_dict)
+    validate_payload(data)
+
+    # Connect to Milvus database
+    milvus_client = connect_to_milvus(db_name="openfn_docs")
+
+    # Generate embeddings for the search query
+    search_embeddings = get_search_embeddings(api_key=data.api_key, query=data.query)
+
+    # Perform the search
+    limit = int(data.limit)
+    res = search_database(milvus_client, search_embeddings, data.partition_name, limit)
+
+    # Extract documents from search results
+    documents = extract_documents(res)
+
+    # Collate llm input
+    context_dict = {
+        "input_text": input_text,
+        "context": documents[0],
+        "additional_context_a": documents[1],
+        "additional_context_b": documents[2],
+        "additional_context_c": documents[3],
+    }
+
+    # Get formatted prompts
+    system_prompt, formatted_user_prompt = build_prompt(context_dict)
+
+    # Query llm with the prompts
+    result = query_llm(system_prompt, formatted_user_prompt)
+
+    return result
+
+
+def main(data):
+
+    input_text = data.get("text", "")
+
+    result = explain_docs(input_text)
+    print(f"Input query: {input_text}")
+    print(f"Answer: {result}")
+
+    return {
+        "answer": result
+    }
+
+if __name__ == "__main__":
+    # Ad-hoc invocation with a JSON input file (the service is normally run via entry.py)
+    import json
+    import sys
+
+    with open(sys.argv[1], "r") as f:
+        main(json.load(f))
\ No newline at end of file
diff --git a/services/docsite_explainer/prompts.py b/services/docsite_explainer/prompts.py
new file mode 100644
index 00000000..4f9445be
--- /dev/null
+++ b/services/docsite_explainer/prompts.py
@@ -0,0 +1,32 @@
+
+system_prompt = """
+You are a helpful assistant for understanding the documentation for OpenFn,
+the world's leading digital public good for workflow automation. You will
+get a passage from the documentation which you will need to explain more clearly.
+You will also get the immediate context from which the text is taken, and additional
+passages from the documentation which may provide more information. You can use
+this additional context or broader sector-specific knowledge (e.g. programming) to explain the passage.
+Keep your answers short, friendly and professional.
+"""
+
+user_prompt = """
+Passage to explain: "{input_text}"
+
+Here's the context the passage is taken from:
+
+"{context}"
+
+Here's additional context from the documentation which may be useful:
+
+"...{additional_context_a}..."
+
+"...{additional_context_b}..."
+
+"...{additional_context_c}..."
+"""
+
+
+def build_prompt(context_dict):
+    formatted_user_prompt = user_prompt.format_map(context_dict)
+
+    return system_prompt, formatted_user_prompt
\ No newline at end of file
diff --git a/services/search/search.py b/services/search/search.py
index c789313c..395bff4d 100644
--- a/services/search/search.py
+++ b/services/search/search.py
@@ -28,7 +28,7 @@ def validate_payload(data: Payload):
     if data.partition_name and data.partition_name not in ["normal_docs", "adaptor_docs"]:
         raise ValueError("Invalid partition_name. Expected values are 'normal_docs' or 'adaptor_docs'.")
 
-def connect_to_milvus() -> MilvusClient:
+def connect_to_milvus(db_name="openfn_docs") -> MilvusClient:
     """
     Connects to the Milvus database client using environment variables.
 
@@ -43,7 +43,7 @@ def connect_to_milvus() -> MilvusClient:
 
     logger.info(f"Connecting to Milvus database...")
 
-    return MilvusClient(uri=zilliz_uri, token=zilliz_token, db_name="openfn_docs")
+    return MilvusClient(uri=zilliz_uri, token=zilliz_token, db_name=db_name)
 
 
 def get_search_embeddings(api_key: str, query: str) -> list:
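
For quick testing without curl, here is a minimal sketch of calling the new endpoint from Python using only the standard library. It assumes the service is running locally at the path shown in the README, and that the HTTP response body is the `{"answer": ...}` object returned by `main()`; if the hosting framework wraps the result differently, adjust the last line accordingly.

```python
import json
import urllib.request

# Assumption: the service is mounted at the same path used in the README's curl example.
URL = "http://localhost:3000/services/docsite_explainer"

# Sample input matching the {"text": ...} shape expected by main().
payload = {"text": "JavaScript is an untyped language - which is very convenient for OpenFn jobs."}

request = urllib.request.Request(
    URL,
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="POST",
)

with urllib.request.urlopen(request) as response:
    body = json.load(response)

# Assumption: the response body is the {"answer": ...} dict returned by main().
print(body.get("answer", body))
```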