Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions services/docsite_explainer/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Docsite Explainer

This is a service for explaining passages in the OpenFn documentation.

This service uses an input text to query a Claude model for a clarification. It also searches an embedding collection containing the full documentation to give the model passages from it as additional context.

## Implementation Notes

This service currently uses the Zilliz embedding storage provider for OpenAI embeddings, and requires both Zilliz (URI, Token) and OpenAI credentials to run.

## Usage

This service can be run from the services folder via the entry.py module, using a JSON file containing the text that needs clarification:

```bash
python entry.py docsite_explainer tmp/docsite_explainer_input.json tmp/docsite_output.json
```

JSON input format example:

```json
{
  "text": "JavaScript is an untyped language - which is very convenient for OpenFn jobs and usually makes life easier."
}
```

This service can also be run using `curl`:

```bash
curl -X POST http://localhost:3000/services/docsite_explainer --json @tmp/docsite_explainer_input.json
```
89 changes: 89 additions & 0 deletions services/docsite_explainer/docsite_explainer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import os
from dotenv import load_dotenv
import anthropic
from search.search import Payload, validate_payload, connect_to_milvus, get_search_embeddings, search_database, extract_documents
from docsite_explainer.prompts import build_prompt

load_dotenv()
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
ZILLIZ_URI = os.getenv("ZILLIZ_CLOUD_URI")
ZILLIZ_TOKEN = os.getenv("ZILLIZ_CLOUD_API_KEY")

client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)


def query_llm(system_prompt, formatted_user_prompt, model="claude-3-5-sonnet-20241022", max_tokens=500):
message = client.messages.create(
model=model,
max_tokens=max_tokens,
temperature=0,
system=system_prompt,
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": formatted_user_prompt
}
]
}
]
)
answer = message.content[0].text

return answer

def explain_docs(input_text):

# Set and validate database settings
settings_dict = {"api_key":OPENAI_API_KEY, "query":input_text, "partition_name":"normal_docs", "limit": 4}
data = Payload(settings_dict)
validate_payload(data)

# Connect to Milvus database
milvus_client = connect_to_milvus(db_name="openfn_docs")

# Generate embeddings for the search query
search_embeddings = get_search_embeddings(api_key=data.api_key, query=data.query)

# Perform the search
limit = int(data.limit)
res = search_database(milvus_client, search_embeddings, data.partition_name, limit)

# Extract documents from search results
documents = extract_documents(res)

# Collate llm input
context_dict = {
"input_text" : input_text,
"context" : documents[0],
"additional_context_a" : documents[1],
"additional_context_b" : documents[2],
"additional_context_c" : documents[3],
}

# Get formatted prompts
system_prompt, formatted_user_prompt = build_prompt(context_dict)

# Query llm with the prompts
result = query_llm(system_prompt, formatted_user_prompt)

return result


def main(data):

input_text = data.get("text", "")

result = explain_docs(input_text)
print(f"Input query: {input_text}")
print(f"Answer: {result}")

return {
"answer": result
}

if __name__ == "__main__":
main()
32 changes: 32 additions & 0 deletions services/docsite_explainer/prompts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@

system_prompt = """
You are a helpful assistant for understanding the documentation for OpenFn,
the world's leading digital public good for workflow automation. You will
get a passage from the documentation which you will need to explain more clearly.
You will also get the immediate context from which the text is taken, and additional
passages from the documentation which may provide more information. You can use
this additional context or broader sector-specific knowledge (e.g. programming) to explain the passage.
Keep your answers short, friendly and professional.
"""

user_prompt = """
Passage to explain: "{input_text}"

Here's the context from which the text is from:

"{context}"

Here's additional context from the documentation which may be useful:

"...{additional_context_a}..."

"...{additional_context_b}..."

"...{additional_context_c}..."
"""


def build_prompt(context_dict):
formatted_user_prompt = user_prompt.format_map(context_dict)

return system_prompt, formatted_user_prompt
4 changes: 2 additions & 2 deletions services/search/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def validate_payload(data: Payload):
if data.partition_name and data.partition_name not in ["normal_docs", "adaptor_docs"]:
raise ValueError("Invalid partition_name. Expected values are 'normal_docs' or 'adaptor_docs'.")

def connect_to_milvus() -> MilvusClient:
def connect_to_milvus(db_name="openfn_docs") -> MilvusClient:
"""
Connects to the Milvus database client using environment variables.

Expand All @@ -43,7 +43,7 @@ def connect_to_milvus() -> MilvusClient:

logger.info(f"Connecting to Milvus database...")

return MilvusClient(uri=zilliz_uri, token=zilliz_token, db_name="openfn_docs")
return MilvusClient(uri=zilliz_uri, token=zilliz_token, db_name=db_name)


def get_search_embeddings(api_key: str, query: str) -> list:
Expand Down