added env variables for SQL & Teradata

Azure · Jun 9, 2024 · 5491551 · 5491551
1 parent 0d7871c
commit 5491551
Show file tree

Hide file tree

Showing 6 changed files with 115 additions and 52 deletions.
diff --git a/.gitignore b/.gitignore
@@ -7,7 +7,6 @@ csx
 .vs
 edge
 Publish
-.vscode/
 
 # Loadtest
 Results/

diff --git a/.vscode/extensions.json b/.vscode/extensions.json
@@ -0,0 +1,6 @@
+{
+  "recommendations": [
+    "ms-azuretools.vscode-azurefunctions",
+    "ms-python.python"
+  ]
+}
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -0,0 +1,15 @@
+{
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Attach to Python Functions",
+            "type": "debugpy",
+            "request": "attach",
+            "connect": {
+                "host": "localhost",
+                "port": 9091
+            },
+            "preLaunchTask": "func: host start"
+        }
+    ]
+}
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -0,0 +1,8 @@
+{
+    "azureFunctions.deploySubpath": ".",
+    "azureFunctions.scmDoBuildDuringDeployment": true,
+    "azureFunctions.pythonVenv": ".venv",
+    "azureFunctions.projectLanguage": "Python",
+    "azureFunctions.projectRuntime": "~4",
+    "debug.internalConsoleOptions": "neverOpen"
+}
diff --git a/.vscode/tasks.json b/.vscode/tasks.json
@@ -0,0 +1,27 @@
+{
+	"version": "2.0.0",
+	"tasks": [
+		{
+			"type": "func",
+			"label": "func: host start",
+			"command": "host start",
+			"problemMatcher": "$func-python-watch",
+			"isBackground": true,
+			"dependsOn": "pip install (functions)"
+		},
+		{
+			"label": "pip install (functions)",
+			"type": "shell",
+			"osx": {
+				"command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt"
+			},
+			"windows": {
+				"command": "${config:azureFunctions.pythonVenv}\\Scripts\\python -m pip install -r requirements.txt"
+			},
+			"linux": {
+				"command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt"
+			},
+			"problemMatcher": []
+		}
+	]
+}
diff --git a/orc/plugins/Retrieval/native_function.py b/orc/plugins/Retrieval/native_function.py
@@ -1,4 +1,4 @@
-from shared.util import get_secret, get_aoai_config,extract_text_from_html,get_possitive_int_or_default
+from shared.util import get_secret, get_aoai_config, extract_text_from_html, get_possitive_int_or_default
 # from semantic_kernel.skill_definition import sk_function
 from openai import AzureOpenAI
 from semantic_kernel.functions import kernel_function
@@ -33,34 +33,40 @@
 # Azure search Integration Settings
 AZURE_OPENAI_EMBEDDING_MODEL = os.environ.get("AZURE_OPENAI_EMBEDDING_MODEL")
 
-TERM_SEARCH_APPROACH='term'
-VECTOR_SEARCH_APPROACH='vector'
-HYBRID_SEARCH_APPROACH='hybrid'
-AZURE_SEARCH_USE_SEMANTIC=os.environ.get("AZURE_SEARCH_USE_SEMANTIC")  or "false"
-AZURE_SEARCH_APPROACH=os.environ.get("AZURE_SEARCH_APPROACH") or HYBRID_SEARCH_APPROACH
+TERM_SEARCH_APPROACH = 'term'
+VECTOR_SEARCH_APPROACH = 'vector'
+HYBRID_SEARCH_APPROACH = 'hybrid'
+AZURE_SEARCH_USE_SEMANTIC = os.environ.get("AZURE_SEARCH_USE_SEMANTIC") or "false"
+AZURE_SEARCH_APPROACH = os.environ.get("AZURE_SEARCH_APPROACH") or HYBRID_SEARCH_APPROACH
 
 AZURE_SEARCH_SERVICE = os.environ.get("AZURE_SEARCH_SERVICE")
 AZURE_SEARCH_INDEX = os.environ.get("AZURE_SEARCH_INDEX")
 AZURE_SEARCH_API_VERSION = os.environ.get("AZURE_SEARCH_API_VERSION", "2023-11-01")
-if AZURE_SEARCH_API_VERSION < '2023-10-01-Preview': # query is using vectorQueries that requires at least 2023-10-01-Preview'.
-    AZURE_SEARCH_API_VERSION = '2023-11-01'  
+if AZURE_SEARCH_API_VERSION < '2023-10-01-Preview':  # the query uses vectorQueries, which requires at least 2023-10-01-Preview.
+    AZURE_SEARCH_API_VERSION = '2023-11-01'
 
 AZURE_SEARCH_TOP_K = os.environ.get("AZURE_SEARCH_TOP_K") or "3"
 
 AZURE_SEARCH_OYD_USE_SEMANTIC_SEARCH = os.environ.get("AZURE_SEARCH_OYD_USE_SEMANTIC_SEARCH") or "false"
 AZURE_SEARCH_OYD_USE_SEMANTIC_SEARCH = True if AZURE_SEARCH_OYD_USE_SEMANTIC_SEARCH == "true" else False
 AZURE_SEARCH_SEMANTIC_SEARCH_CONFIG = os.environ.get("AZURE_SEARCH_SEMANTIC_SEARCH_CONFIG") or "my-semantic-config"
 AZURE_SEARCH_ENABLE_IN_DOMAIN = os.environ.get("AZURE_SEARCH_ENABLE_IN_DOMAIN") or "true"
-AZURE_SEARCH_ENABLE_IN_DOMAIN =  True if AZURE_SEARCH_ENABLE_IN_DOMAIN == "true" else False
+AZURE_SEARCH_ENABLE_IN_DOMAIN = True if AZURE_SEARCH_ENABLE_IN_DOMAIN == "true" else False
 AZURE_SEARCH_CONTENT_COLUMNS = os.environ.get("AZURE_SEARCH_CONTENT_COLUMNS") or "content"
 AZURE_SEARCH_FILENAME_COLUMN = os.environ.get("AZURE_SEARCH_FILENAME_COLUMN") or "filepath"
 AZURE_SEARCH_TITLE_COLUMN = os.environ.get("AZURE_SEARCH_TITLE_COLUMN") or "title"
 AZURE_SEARCH_URL_COLUMN = os.environ.get("AZURE_SEARCH_URL_COLUMN") or "url"
-#Bing search Integration Settings
+# Bing search Integration Settings
 BING_SEARCH_TOP_K = os.environ.get("BING_SEARCH_TOP_K") or "3"
-BING_CUSTOM_SEARCH_URL="https://api.bing.microsoft.com/v7.0/custom/search?"
+BING_CUSTOM_SEARCH_URL = "https://api.bing.microsoft.com/v7.0/custom/search?"
 BING_SEARCH_MAX_TOKENS = os.environ.get("BING_SEARCH_MAX_TOKENS") or "1000"
-#DB Integration Settings
+# SQL Integration Settings
+SQL_TOP_K = os.environ.get("SQL_TOP_K") or "3"
+SQL_MAX_TOKENS = os.environ.get("SQL_MAX_TOKENS") or "1000"
+# Teradata Integration Settings
+TERADATA_TOP_K = os.environ.get("TERADATA_TOP_K") or "3"
+TERADATA_MAX_TOKENS = os.environ.get("TERADATA_MAX_TOKENS") or "1000"
+# DB Integration Settings
 AZURE_OPENAI_CHATGPT_MODEL = os.environ.get("AZURE_OPENAI_CHATGPT_MODEL")
 AZURE_OPENAI_CHATGPT_DEPLOYMENT = os.environ.get("AZURE_OPENAI_CHATGPT_DEPLOYMENT")
 AZURE_OPENAI_RESOURCE = os.environ.get("AZURE_OPENAI_RESOURCE")
@@ -77,19 +83,17 @@
 @retry(wait=wait_random_exponential(min=2, max=60), stop=stop_after_attempt(6), reraise=True)
 # Function to generate embeddings for title and content fields, also used for query embeddings
 def generate_embeddings(text):
-
     embeddings_config = get_aoai_config(AZURE_OPENAI_EMBEDDING_MODEL)
 
     client = AzureOpenAI(
-        api_version = embeddings_config['api_version'],
-        azure_endpoint = embeddings_config['endpoint'],
-        azure_ad_token = embeddings_config['api_key'],
+        api_version=embeddings_config['api_version'],
+        azure_endpoint=embeddings_config['endpoint'],
+        azure_ad_token=embeddings_config['api_key'],
     )
-
-    embeddings =  client.embeddings.create(input = [text], model= embeddings_config['deployment']).data[0].embedding
 
-    return embeddings
+    embeddings = client.embeddings.create(input=[text], model=embeddings_config['deployment']).data[0].embedding
 
+    return embeddings
 
 class Retrieval:
     @kernel_function(
@@ -106,16 +110,16 @@ def VectorIndexRetrieval(
             start_time = time.time()
             logging.info(f"[sk_retrieval] generating question embeddings. search query: {search_query}")
             embeddings_query = generate_embeddings(search_query)
-            response_time = round(time.time() - start_time,2)
+            response_time = round(time.time() - start_time, 2)
             logging.info(f"[sk_retrieval] finished generating question embeddings. {response_time} seconds")
-            azureSearchKey = get_secret('azureSearchKey') 
+            azureSearchKey = get_secret('azureSearchKey')
 
             logging.info(f"[sk_retrieval] querying azure ai search. search query: {search_query}")
             # prepare body
             body = {
                 "select": "title, content, url, filepath, chunk_id",
                 "top": AZURE_SEARCH_TOP_K
-            }    
+            }
             if AZURE_SEARCH_APPROACH == TERM_SEARCH_APPROACH:
                 body["search"] = search_query
             elif AZURE_SEARCH_APPROACH == VECTOR_SEARCH_APPROACH:
@@ -143,7 +147,7 @@ def VectorIndexRetrieval(
                 'api-key': azureSearchKey
             }
             search_endpoint = f"https://{AZURE_SEARCH_SERVICE}.search.windows.net/indexes/{AZURE_SEARCH_INDEX}/docs/search?api-version={AZURE_SEARCH_API_VERSION}"
-            
+
             start_time = time.time()
             response = requests.post(search_endpoint, headers=headers, json=body)
             status_code = response.status_code
@@ -154,23 +158,22 @@ def VectorIndexRetrieval(
                 logging.error(f"[sk_retrieval] error {status_code} when searching documents. {error_message}")
             else:
                 if response.json()['value']:
-                        for doc in response.json()['value']:
-                            search_results.append(doc['filepath'] + ": "+ doc['content'].strip() + "\n")    
-
-            response_time =  round(time.time() - start_time,2)
-            # logging.info(f"[sk_retrieval] search query body: {body}")        
+                    for doc in response.json()['value']:
+                        search_results.append(doc['filepath'] + ": " + doc['content'].strip() + "\n")
+
+            response_time = round(time.time() - start_time, 2)
             logging.info(f"[sk_retrieval] finished querying azure ai search. {response_time} seconds")
         except Exception as e:
             error_message = str(e)
             logging.error(f"[sk_retrieval] error when getting the answer {error_message}")
-        
+
         sources = ' '.join(search_results)
         return sources
-    
+
     @kernel_function(
         description="Search bing for sources to ground and give context to answer a user question. Return sources.",
         name="BingRetrieval",
-        )
+    )
     def BingRetrieval(
         self,
         input: Annotated[str, "The user question"]
@@ -179,34 +182,34 @@ def BingRetrieval(
         bing_custom_config_id = get_secret('bingCustomConfigId')
         client = CustomSearchClient(endpoint=BING_CUSTOM_SEARCH_URL, credentials=CognitiveServicesCredentials(bing_custom_search_subscription_key))
         start_time = time.time()
-        web_data = client.custom_instance.search(query=input, custom_config=bing_custom_config_id,count=BING_SEARCH_TOP_K)
+        web_data = client.custom_instance.search(query=input, custom_config=bing_custom_config_id, count=BING_SEARCH_TOP_K)
         logging.info(f"[bing retrieval] bing search. {input}. {time.time() - start_time} seconds.")
         bing_sources = ""
         if web_data.web_pages and hasattr(web_data.web_pages, 'value'):
             for web in web_data.web_pages.value:
                 try:
                     start_time = time.time()
-                    html=extract_text_from_html(web.url)  
-                    bing_sources+=html[:get_possitive_int_or_default(BING_SEARCH_MAX_TOKENS,1000)]
+                    html = extract_text_from_html(web.url)
+                    bing_sources += html[:get_possitive_int_or_default(BING_SEARCH_MAX_TOKENS, 1000)]
                     logging.info(f"[bing retrieval] finished scraping web. {web.url}. {time.time() - start_time} seconds.")
                 except Exception as e:
                     logging.error(f"[bing retrieval] could not scrape web. {web.url}. {e}")
-                    bing_sources+=web.snippet
+                    bing_sources += web.snippet
         return bing_sources
-    
+
     @kernel_function(
         description="Search a SQL or Teradata DB for sources to ground and give context to answer a user question. Return sources.",
         name="DBRetrieval",
-        )
+    )
     def DBRetrieval(self,
-                       input: Annotated[str, "The user question"],
-                       db_type: Annotated[str, "The type of database to connect to (sql or teradata)"],
-                       db_server: Annotated[str, "The server to connect to"],
-                       db_database: Annotated[str, "The database to connect to"],
-                       db_table_info: Annotated[str, "The tables to search for information"],
-                       db_username: Annotated[str, "The username to connect to the database"],
-                       db_top_k: Annotated[str, "The number of results to return"]
-                       )-> Annotated[str, "the output is a string with the search results"]:
+                    input: Annotated[str, "The user question"],
+                    db_type: Annotated[str, "The type of database to connect to (sql or teradata)"],
+                    db_server: Annotated[str, "The server to connect to"],
+                    db_database: Annotated[str, "The database to connect to"],
+                    db_table_info: Annotated[str, "The tables to search for information"],
+                    db_username: Annotated[str, "The username to connect to the database"],
+                    db_top_k: Annotated[str, "The number of results to return"]
+                    ) -> Annotated[str, "the output is a string with the search results"]:
         logging.info('Python HTTP trigger function processed a request.')
 
         try:
@@ -216,10 +219,14 @@ def DBRetrieval(self,
             # Connect to Key Vault and get database password
             if db_type == "sql":
                 db_password = get_secret("sqlpassword")
+                db_top_k = get_possitive_int_or_default(db_top_k, SQL_TOP_K)
+                max_tokens = get_possitive_int_or_default(SQL_MAX_TOKENS, 1000)
             elif db_type == "teradata":
                 db_password = get_secret("teradatapassword")
+                db_top_k = get_possitive_int_or_default(db_top_k, TERADATA_TOP_K)
+                max_tokens = get_possitive_int_or_default(TERADATA_MAX_TOKENS, 1000)
             else:
-                logging.error(f"[DBRetrieval]Invalid db_type specified")
+                logging.error(f"[DBRetrieval] Invalid db_type specified")
                 return ""
             azureOpenAIKey = get_secret("azureOpenAIKey")
 
@@ -237,10 +244,11 @@ def DBRetrieval(self,
                 conn_str = f'mssql+pyodbc:///?odbc_connect={params}'
             elif db_type == "teradata":
                 driver = 'Teradata'
-                params = urllib.parse.quote_plus(f"DRIVER={driver};SERVER={db_server};DATABASE={db_database};UID={db_username};PWD={db_password}")
+                params = urllib.parse.quote_plus(f"DRIVER={driver};DBCNAME={db_server};DATABASE={db_database};UID={db_username};PWD={db_password}")
                 conn_str = f'teradata:///?odbc_connect={params}'
             else:
                 logging.error("[DBRetrieval] Invalid db_type specified")
+                return ""
 
             engine = create_engine(conn_str)
             logging.info(f"[{db_type} Retrieval] Connection to database is successful")
@@ -287,12 +295,12 @@ def DBRetrieval(self,
             )
 
             query_engine = SQLTableRetrieverQueryEngine(
-                sql_database, obj_index.as_retriever(similarity_top_k=get_possitive_int_or_default(db_top_k, 3))
+                sql_database, obj_index.as_retriever(similarity_top_k=db_top_k)
             )
 
-            query = input
+            query = input[:max_tokens]
             response = query_engine.query(query)
-            result=response.response
+            result = response.response
             logging.info(f"[{db_type} Retrieval] SQLQuery: {response.metadata.get('sql_query')}")
             engine.dispose()
             return result
@@ -301,4 +309,4 @@ def DBRetrieval(self,
             return ""
         except Exception as e:
             logging.error(f"[{db_type} Retrieval]  Unexpected error: {e}")
-            return ""
+            return ""
-Original file line number
+Diff line change
@@ Expand Up / @@ -7,7 +7,6 @@ csx @@
     .vs
     edge
     Publish
-    .vscode/
     # Loadtest
     Results/
@@ Expand Down @@