
Commit 3cf1f7d

Support models used by text2sql project [Initial commit] (#1)
1 parent d4c1d82 commit 3cf1f7d

5 files changed: +105 -23 lines changed


pyspark_ai/ai_utils.py

Lines changed: 3 additions & 0 deletions
@@ -23,6 +23,9 @@ def __init__(self, spark_ai, df_instance: DataFrame):
         self.spark_ai = spark_ai
         self.df_instance = df_instance
 
+    def transform_tpch(self, desc: str, table: str, cache: bool = False) -> DataFrame:
+        return self.spark_ai.transform_df_tpch(desc, table, cache)
+
     def transform(self, desc: str, cache: bool = True) -> DataFrame:
         """
         Transform the DataFrame using the given description.
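
A minimal usage sketch of the new wrapper, assuming the `.ai` DataFrame accessor that `spark_ai.activate()` installs in upstream pyspark-ai; the TPC-H table name and question below are illustrative, not part of this commit:

    # Hedged sketch: `lineitem` and the description are made-up examples.
    from pyspark.sql import SparkSession
    from pyspark_ai import SparkAI

    spark = SparkSession.builder.getOrCreate()
    spark_ai = SparkAI()
    spark_ai.activate()  # attaches the `.ai` accessor to DataFrames

    lineitem = spark.table("lineitem")  # hypothetical TPC-H table
    # The DataFrame only supplies the accessor; the query is generated against
    # the named table via spark_ai.transform_df_tpch.
    result = lineitem.ai.transform_tpch(
        desc="total quantity shipped per ship year", table="lineitem"
    )
    result.show()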

pyspark_ai/prompt.py

Lines changed: 13 additions & 11 deletions
@@ -171,23 +171,25 @@
     spark_sql_shared_example_4,
 ]
 
-SPARK_SQL_SUFFIX = """\nQuestion: Given a Spark temp view `{view_name}` {comment}.
-
-Here are column names and sample values from each column, to help you understand the columns in the dataframe.
-The format will be (column_name, type, [sample_value_1, sample_value_2...])...
-Use these column names and sample values to help you choose which columns to query.
-It's very important to ONLY use the verbatim column_name in your resulting SQL query; DO NOT include the type.
+SPARK_SQL_SUFFIX = """\nQuestion: Given a Spark temp view `{view_name}` {comment} with the following sample vals,
+in the format (column_name, type, [sample_value_1, sample_value_2...]):
+```
 {sample_vals}
-
-Write a Spark SQL query to retrieve the following from view `{view_name}`: {desc}
+```
+Write a Spark SQL query to retrieve from view `{view_name}`: {desc}
+Answer:
 """
 
 SPARK_SQL_SUFFIX_FOR_AGENT = SPARK_SQL_SUFFIX + "\n{agent_scratchpad}"
 
 SPARK_SQL_PREFIX = """You are an assistant for writing professional Spark SQL queries.
-Given a question, you need to write a Spark SQL query to answer the question. The result is ALWAYS a Spark SQL query.
-Use the COUNT SQL function when the query asks for total number of some non-countable column.
-Use the SUM SQL function to accumulate the total number of countable column values."""
+Given a question, you need to write a Spark SQL query to answer the question.
+The rules that you should follow for answering question:
+1.The answer only consists of Spark SQL query. No explaination. No
+2.SQL statements should be Spark SQL query.
+3.ONLY use the verbatim column_name in your resulting SQL query; DO NOT include the type.
+4.Use the COUNT SQL function when the query asks for total number of some non-countable column.
+5.Use the SUM SQL function to accumulate the total number of countable column values."""
 
 SPARK_SQL_PREFIX_VECTOR_SEARCH = (
     SPARK_SQL_PREFIX
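
Since SPARK_SQL_SUFFIX stays a plain format string, one way to preview the prompt the model now sees is to substitute illustrative values directly (view name, sample values, and question below are made up; the real chain composes it through its prompt template):

    # Hedged illustration: renders the reshaped suffix with made-up values.
    from pyspark_ai.prompt import SPARK_SQL_SUFFIX

    rendered = SPARK_SQL_SUFFIX.format(
        view_name="spark_ai_temp_view_1",
        comment="(TPC-H lineitem rows)",
        sample_vals="(l_orderkey, bigint, [1, 2, 3])\n(l_quantity, decimal(12,2), [17.00, 36.00, 8.00])",
        desc="total quantity per order key",
    )
    print(rendered)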

pyspark_ai/pyspark_ai.py

Lines changed: 70 additions & 1 deletion
@@ -2,6 +2,7 @@
 import io
 import os
 import re
+import time
 from typing import Callable, List, Optional
 from urllib.parse import urlparse
 
@@ -480,6 +481,7 @@ def _get_sql_query(
             )
         else:
             # Otherwise, generate the SQL query with a prompt with few-shot examples
+            print(f"-------------------------Start generating sql query with a prompt with few-shot examples-------------------------\n\n")
             return self.sql_chain.run(
                 view_name=temp_view_name,
                 sample_vals=sample_vals_str,
@@ -496,6 +498,7 @@ def _get_transform_sql_query(self, df: DataFrame, desc: str, cache: bool) -> str
         df.createOrReplaceTempView(temp_view_name)
         schema_lst = self._get_df_schema(df)
         schema_str = "\n".join(schema_lst)
+        print(f"-------------------------Current table schema from df is:-------------------------\n\n {schema_str}\n")
         sample_rows = self._get_sample_spark_rows(df)
         schema_row_lst = []
         for index in range(len(schema_lst)):
@@ -505,8 +508,9 @@ def _get_transform_sql_query(self, df: DataFrame, desc: str, cache: bool) -> str
             curr_schema_row = f"({schema_lst[index]}, {str(sample_vals)})"
             schema_row_lst.append(curr_schema_row)
         sample_vals_str = "\n".join([str(val) for val in schema_row_lst])
+        print(f"-------------------------Current sample vals are:-------------------------\n\n {sample_vals_str}\n")
         comment = self._get_table_comment(df)
-
+        print(f"-------------------------Current table comment is-------------------------\n\n {comment}\n")
         if cache:
             cache_key = ReActSparkSQLAgent.cache_key(desc, schema_str)
             cached_result = self._cache.lookup(key=cache_key)
@@ -523,6 +527,65 @@ def _get_transform_sql_query(self, df: DataFrame, desc: str, cache: bool) -> str
         else:
             return self._get_sql_query(temp_view_name, sample_vals_str, comment, desc)
 
+
+    def _get_table_schema(self, table: str) -> list:
+        df = self._spark.sql(f"select * from {table}")
+        schema_lst = [f"{name}, {dtype}" for name, dtype in df.dtypes]
+        return schema_lst
+
+    def _get_sample_spark_rows(self, df: DataFrame) -> list:
+
+        if self._sample_rows_in_table_info <= 0:
+            return []
+        try:
+            sample_rows = SparkUtils.get_dataframe_results(df.limit(3))
+            return sample_rows
+        except Exception:
+            # If fail to get sample rows, return empty list
+            return []
+
+    def _get_sample_spark_rows_tpch(self, table: str) -> list:
+
+        if self._sample_rows_in_table_info <= 0:
+            return []
+        df = self._spark.sql(f"select * from {table}")
+        try:
+            sample_rows = SparkUtils.get_dataframe_results(df.limit(3))
+            return sample_rows
+        except Exception:
+            # If fail to get sample rows, return empty list
+            return []
+
+    def _get_transform_sql_query_tpch(self, desc: str, table: str, cache: bool) -> str:
+        self.log(f"Retrieve table schema for {table} \n")
+        schema_lst = self._get_table_schema(table)
+        schema_str = "\n".join(schema_lst)
+        print(f"-------------------------Current table schema from df is:-------------------------\n\n {schema_str}\n")
+        sample_rows = self._get_sample_spark_rows_tpch(table)
+        schema_row_lst = []
+        for index in range(len(schema_lst)):
+            sample_vals = []
+            for sample_row in sample_rows:
+                sample_vals.append(sample_row[index])
+            curr_schema_row = f"({schema_lst[index]}, {str(sample_vals)})"
+            schema_row_lst.append(curr_schema_row)
+        sample_vals_str = "\n".join([str(val) for val in schema_row_lst])
+        print(f"-------------------------Current sample vals are:-------------------------\n\n {sample_vals_str}\n")
+        #comment = self._get_table_comment(df)
+        comment = ""
+        #print(f"-------------------------Current table comment is-------------------------\n\n {comment}\n")
+        return self._get_sql_query(table, sample_vals_str, comment, desc)
+
+    def transform_df_tpch(self, desc: str, table: str, cache: bool = False) -> DataFrame:
+        print(f"---------------------TPCH Table {table}------------------------------\n\n")
+        start_time = time.time()
+        sql_query = self._get_transform_sql_query_tpch(desc, table, cache)
+        end_time = time.time()
+        get_transform_sql_query_time = end_time - start_time
+        print(f"-------------------------End get_transform_sql_query-------------------------\n\n get_transform_sql_query_time: {get_transform_sql_query_time} seconds\n")
+        print(f"-------------------------Received query:-------------------------\n\n {sql_query}\n")
+        return self._spark.sql(sql_query)
+
     def transform_df(self, df: DataFrame, desc: str, cache: bool = True) -> DataFrame:
         """
         This method applies a transformation to a provided Spark DataFrame,
@@ -535,7 +598,13 @@ def transform_df(self, df: DataFrame, desc: str, cache: bool = True) -> DataFram
         :return: Returns a new Spark DataFrame that is the result of applying the specified transformation
             on the input DataFrame.
         """
+        print(f"-------------------------Start get_transform_sql_query-------------------------\n\n")
+        start_time = time.time()
         sql_query = self._get_transform_sql_query(df, desc, cache)
+        end_time = time.time()
+        get_transform_sql_query_time = end_time - start_time
+        print(f"-------------------------End get_transform_sql_query-------------------------\n\n get_transform_sql_query_time: {get_transform_sql_query_time} seconds\n")
+        print(f"-------------------------Received query:-------------------------\n\n {sql_query}\n")
         return self._spark.sql(sql_query)
 
     def explain_df(self, df: DataFrame, cache: bool = True) -> str:
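
Taken together, the `_tpch` path reads schema and sample rows straight from a named table instead of a DataFrame, then reuses `_get_sql_query`. A rough end-to-end sketch, with the table name and question invented for illustration:

    # Hedged sketch: assumes a TPC-H `orders` table is registered in the
    # current Spark catalog (not part of this commit).
    from pyspark.sql import SparkSession
    from pyspark_ai import SparkAI

    spark = SparkSession.builder.getOrCreate()
    spark_ai = SparkAI()

    # Reads the schema plus up to 3 sample rows from `orders`, builds the
    # few-shot prompt, prints timing, and executes the generated SQL.
    result = spark_ai.transform_df_tpch(
        desc="number of orders placed in 1995, grouped by order priority",
        table="orders",
    )
    result.show()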

pyspark_ai/python_executor.py

Lines changed: 3 additions & 3 deletions
@@ -38,9 +38,9 @@ def run(
     ) -> str:
         assert not args, "The chain expected no arguments"
         # assert llm is an instance of BaseChatModel
-        assert isinstance(
-            self.llm, BaseChatModel
-        ), "The llm is not an instance of BaseChatModel"
+        #assert isinstance(
+        #    self.llm, BaseChatModel
+        #), "The llm is not an instance of BaseChatModel"
         prompt_str = canonize_string(self.prompt.format_prompt(**kwargs).to_string())
         use_cache = tags != SKIP_CACHE_TAGS
         if self.cache is not None:

pyspark_ai/spark_sql_chain.py

Lines changed: 16 additions & 8 deletions
@@ -5,6 +5,7 @@
 from langchain.chat_models.base import BaseChatModel
 from langchain.schema import BaseMessage, HumanMessage
 from pyspark.sql import SparkSession
+from langchain_core.language_models.llms import BaseLLM
 
 from pyspark_ai.ai_utils import AIUtils
 from pyspark_ai.code_logger import CodeLogger
@@ -29,24 +30,28 @@ def run(
     ) -> str:
         assert not args, "The chain expected no arguments"
         # assert llm is an instance of BaseChatModel
-        assert isinstance(
-            self.llm, BaseChatModel
-        ), "The llm is not an instance of BaseChatModel"
+        #assert isinstance(
+        #    self.llm, BaseChatModel
+        #), "The llm is not an instance of BaseChatModel"
         prompt_str = self.prompt.format_prompt(**kwargs).to_string()
+        print(f"-------------------------Input prompt is:-------------------------\n\n {prompt_str}\n")
         messages = [HumanMessage(content=prompt_str)]
         return self._generate_code_with_retries(self.llm, messages, self.max_retries)
 
     def _generate_code_with_retries(
         self,
-        chat_model: BaseChatModel,
+        chat_model: BaseLLM,
         messages: List[BaseMessage],
         retries: int = 3,
     ) -> str:
         response = chat_model.predict_messages(messages)
-        if self.logger is not None:
-            self.logger.info(response.content)
+        print(f"-------------------------The model replies:-------------------------\n\n {response.content} \n")
+        #if self.logger is not None:
+        #    self.logger.info(response.content)
         code = AIUtils.extract_code_blocks(response.content)[0]
+        #code = response.content.split("\n")[1].split("Human:")[1].replace("`","")
         try:
+            print(f"-------------------------Spark retrieved sql:-------------------------\n\n {code}\n")
             self.spark.sql(code)
             return code
         except Exception as e:
@@ -61,7 +66,10 @@ def _generate_code_with_retries(
             if self.logger is not None:
                 self.logger.info("Retrying with " + str(retries) + " retries left")
 
-            messages.append(response)
+            # messages.append(response)
+            # Remove retry logic to prevent long response append and ensure accurate model results.
+
             # append the exception as a HumanMessage into messages
-            messages.append(HumanMessage(content=str(e)))
+            # messages.append(HumanMessage(content=str(e)))
+            # Remove retry logic to prevent long response append and ensure accurate model results.
             return self._generate_code_with_retries(chat_model, messages, retries - 1)
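
With the retry context trimmed, a single reply has to contain a usable query. A small sketch of what the chain now does with one reply; the reply text is invented and the fence-stripping behavior of `AIUtils.extract_code_blocks` is assumed from its use above:

    # Hedged sketch of the single-shot path in _generate_code_with_retries.
    from pyspark.sql import SparkSession
    from pyspark_ai.ai_utils import AIUtils

    spark = SparkSession.builder.getOrCreate()

    reply = "```sql\nSELECT o_orderpriority, COUNT(*) AS order_count FROM orders GROUP BY o_orderpriority\n```"
    code = AIUtils.extract_code_blocks(reply)[0]  # first fenced block -> SQL text
    print(code)
    spark.sql(code)  # validation: raises if the generated SQL does not run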
