Use inconsistent format for similarity search input (pyspark-ai#166)

asl3 · web-flow · commit 46f9cec39c13 · 2023-10-17T11:01:27.000-07:00
Make the queries and the query column input format inconsistent in
`vector_similarity_search.ipynb`, to demonstrate how the similarity search
tool can help with unpredictable input formats.
diff --git a/examples/vector_similarity_search.ipynb b/examples/vector_similarity_search.ipynb
@@ -30,7 +30,7 @@
      "text": [
       "Setting default log level to \"WARN\".\n",
       "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
-      "23/10/16 21:04:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
+      "23/10/17 13:50:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
      ]
     }
    ],
@@ -53,17 +53,17 @@
    },
    "outputs": [],
    "source": [
-    "# create a dataframe\n",
+    "# create a dataframe with inconsistent date formats\n",
     "scores_df = spark_ai._spark.createDataFrame(\n",
     "    [\n",
     "        (\"09-10-2017\", \"24-20\", \"New England Patriots\"),\n",
-    "        (\"09-17-2017\", \"27-23\", \"Green Bay Packers\"),\n",
-    "        (\"09-24-2017\", \"30-26\", \"Kansas City Chiefs\"),\n",
+    "        (\"2017-17-09\", \"27-23\", \"Green Bay Packers\"),\n",
+    "        (\"09/24/2017\", \"30-26\", \"Kansas City Chiefs\"),\n",
     "        (\"10-01-2017\", \"26-24\", \"Los Angeles Rams\"),\n",
-    "        (\"10-08-2017\", \"23-10\", \"Baltimore Ravens\"),\n",
-    "        (\"10-15-2017\", \"20-17\", \"Miami Dolphins\"),\n",
+    "        (\"10/08/2017\", \"23-10\", \"Baltimore Ravens\"),\n",
+    "        (\"10/15/17\", \"20-17\", \"Miami Dolphins\"),\n",
     "        (\"10-22-2017\", \"33-31\", \"Chicago Bears\"),\n",
-    "        (\"10-29-2017\", \"27-24\", \"Philadelphia Eagles\"),\n",
+    "        (\"2017-10-29\", \"27-24\", \"Philadelphia Eagles\"),\n",
     "        (\"11-05-2017\", \"28-23\", \"Carolina Panthers\"),\n",
     "        (\"11-12-2017\", \"27-24\", \"New Orleans Saints\")\n",
     "    ],\n",
@@ -86,7 +86,7 @@
      "output_type": "stream",
      "text": [
       "\u001b[92mINFO: \u001b[0mCreating temp view for the transform:\n",
-      "df.createOrReplaceTempView(\u001b[33m\"\u001b[39;49;00m\u001b[33mspark_ai_temp_view_170570205\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m)\u001b[37m\u001b[39;49;00m\n",
+      "df.createOrReplaceTempView(\u001b[33m\"\u001b[39;49;00m\u001b[33mspark_ai_temp_view_1345058194\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m)\u001b[37m\u001b[39;49;00m\n",
       "\n"
      ]
     },
@@ -104,19 +104,18 @@
       "\n",
       "\n",
       "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
-      "\u001b[32;1m\u001b[1;3mThought: The column names are non-descriptive, but from the sample values I see that column `date` contains dates\n",
-      "and column `winner` contains team names. So, I will filter on column `date` for 'October 29th' and select column `winner` for the team.\n",
-      "I will use = rather than \"like\" in my SQL query because I need an exact match. I'll use the similar_value tool to help\n",
-      "me choose an exact filter value for `date`.\n",
+      "\u001b[32;1m\u001b[1;3mThought: The keyword 'October 29th' is most similar to the sample values in the `date` column.\n",
+      "I need to filter on an exact value from the `date` column, so I will use the tool similar_value to help me choose my filter value.\n",
       "Action: similar_value\n",
-      "Action Input: October 29th|date|spark_ai_temp_view_170570205\u001b[0m\n",
-      "Observation: \u001b[38;5;200m\u001b[1;3m10-29-2017\u001b[0m\n",
-      "Thought:\u001b[32;1m\u001b[1;3mThe correct `date` filter should be '10-29-2017' because it is semantically closest to the keyword.\n",
+      "Action Input: October 29th|date|spark_ai_temp_view_1345058194\u001b[0m\n",
+      "Observation: \u001b[38;5;200m\u001b[1;3m2017-10-29\u001b[0m\n",
+      "Thought:\u001b[32;1m\u001b[1;3mThe correct `date` filter should be '2017-10-29' because it is semantically closest to the keyword.\n",
+      "I will use the column `date` to filter the rows where its value is '2017-10-29' and then select the `winner`.\n",
       "Action: query_validation\n",
-      "Action Input: SELECT `winner` FROM `spark_ai_temp_view_170570205` WHERE `date` = '10-29-2017'\u001b[0m\n",
+      "Action Input: SELECT `winner` FROM `spark_ai_temp_view_1345058194` WHERE `date` = '2017-10-29'\u001b[0m\n",
       "Observation: \u001b[33;1m\u001b[1;3mOK\u001b[0m\n",
       "Thought:\u001b[32;1m\u001b[1;3mI now know the final answer.\n",
-      "Final Answer: SELECT `winner` FROM `spark_ai_temp_view_170570205` WHERE `date` = '10-29-2017'\u001b[0m\n",
+      "Final Answer: SELECT `winner` FROM `spark_ai_temp_view_1345058194` WHERE `date` = '2017-10-29'\u001b[0m\n",
       "\n",
       "\u001b[1m> Finished chain.\u001b[0m\n",
       "+-------------------+\n",
@@ -142,24 +141,24 @@
      "output_type": "stream",
      "text": [
       "\u001b[92mINFO: \u001b[0mCreating temp view for the transform:\n",
-      "df.createOrReplaceTempView(\u001b[33m\"\u001b[39;49;00m\u001b[33mspark_ai_temp_view_170570205\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m)\u001b[37m\u001b[39;49;00m\n",
+      "df.createOrReplaceTempView(\u001b[33m\"\u001b[39;49;00m\u001b[33mspark_ai_temp_view_1345058194\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m)\u001b[37m\u001b[39;49;00m\n",
       "\n",
       "\n",
       "\n",
       "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
-      "\u001b[32;1m\u001b[1;3mThought: The keyword '28 v 23' is most similar to the sample values in the `score` column.\n",
+      "\u001b[32;1m\u001b[1;3mThought: The keyword '23 v 28' is most similar to the sample values in the `score` column.\n",
       "I need to filter on an exact value from the `score` column, so I will use the tool similar_value to help me choose my filter value.\n",
       "Action: similar_value\n",
-      "Action Input: 28 v 23|score|spark_ai_temp_view_170570205\u001b[0m\n",
+      "Action Input: 23 v 28|score|spark_ai_temp_view_1345058194\u001b[0m\n",
       "Observation: \u001b[38;5;200m\u001b[1;3m28-23\u001b[0m\n",
       "Thought:\u001b[32;1m\u001b[1;3mThe correct `score` filter should be '28-23' because it is semantically closest to the keyword.\n",
       "I will use the column `score` to filter the rows where its value is '28-23' and then select the `date` \n",
       "because the question asks for the date when the score was '28-23'.\n",
       "Action: query_validation\n",
-      "Action Input: SELECT `date` FROM `spark_ai_temp_view_170570205` WHERE `score` = '28-23'\u001b[0m\n",
+      "Action Input: SELECT `date` FROM `spark_ai_temp_view_1345058194` WHERE `score` = '28-23'\u001b[0m\n",
       "Observation: \u001b[33;1m\u001b[1;3mOK\u001b[0m\n",
       "Thought:\u001b[32;1m\u001b[1;3mI now know the final answer.\n",
-      "Final Answer: SELECT `date` FROM `spark_ai_temp_view_170570205` WHERE `score` = '28-23'\u001b[0m\n",
+      "Final Answer: SELECT `date` FROM `spark_ai_temp_view_1345058194` WHERE `score` = '28-23'\u001b[0m\n",
       "\n",
       "\u001b[1m> Finished chain.\u001b[0m\n",
       "+----------+\n",
@@ -172,7 +171,7 @@
     }
    ],
    "source": [
-    "scores_df.ai.transform(desc=\"When was the score 28 v 23?\", cache=False).show()"
+    "scores_df.ai.transform(desc=\"When was the score 23 v 28?\", cache=False).show()"
    ]
   },
   {
@@ -185,7 +184,7 @@
      "output_type": "stream",
      "text": [
       "\u001b[92mINFO: \u001b[0mCreating temp view for the transform:\n",
-      "df.createOrReplaceTempView(\u001b[33m\"\u001b[39;49;00m\u001b[33mspark_ai_temp_view_170570205\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m)\u001b[37m\u001b[39;49;00m\n",
+      "df.createOrReplaceTempView(\u001b[33m\"\u001b[39;49;00m\u001b[33mspark_ai_temp_view_1345058194\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m)\u001b[37m\u001b[39;49;00m\n",
       "\n",
       "\n",
       "\n",
@@ -195,20 +194,20 @@
       "I will use = rather than \"like\" in my SQL query because I need an exact match. I'll use the similar_value tool to help\n",
       "me choose an exact filter value for `winner`.\n",
       "Action: similar_value\n",
-      "Action Input: Chiefs|winner|spark_ai_temp_view_170570205\u001b[0m\n",
+      "Action Input: Chiefs|winner|spark_ai_temp_view_1345058194\u001b[0m\n",
       "Observation: \u001b[38;5;200m\u001b[1;3mKansas City Chiefs\u001b[0m\n",
       "Thought:\u001b[32;1m\u001b[1;3mThe correct `winner` filter should be 'Kansas City Chiefs' because it is semantically closest to the keyword.\n",
       "Action: query_validation\n",
-      "Action Input: SELECT `date` FROM `spark_ai_temp_view_170570205` WHERE `winner` = 'Kansas City Chiefs'\u001b[0m\n",
+      "Action Input: SELECT `date` FROM `spark_ai_temp_view_1345058194` WHERE `winner` = 'Kansas City Chiefs'\u001b[0m\n",
       "Observation: \u001b[33;1m\u001b[1;3mOK\u001b[0m\n",
       "Thought:\u001b[32;1m\u001b[1;3mI now know the final answer.\n",
-      "Final Answer: SELECT `date` FROM `spark_ai_temp_view_170570205` WHERE `winner` = 'Kansas City Chiefs'\u001b[0m\n",
+      "Final Answer: SELECT `date` FROM `spark_ai_temp_view_1345058194` WHERE `winner` = 'Kansas City Chiefs'\u001b[0m\n",
       "\n",
       "\u001b[1m> Finished chain.\u001b[0m\n",
       "+----------+\n",
       "|      date|\n",
       "+----------+\n",
-      "|09-24-2017|\n",
+      "|09/24/2017|\n",
       "+----------+\n",
       "\n"
      ]
@@ -254,24 +253,24 @@
      "output_type": "stream",
      "text": [
       "\u001b[92mINFO: \u001b[0mCreating temp view for the transform:\n",
-      "df.createOrReplaceTempView(\u001b[33m\"\u001b[39;49;00m\u001b[33mspark_ai_temp_view_1020457715\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m)\u001b[37m\u001b[39;49;00m\n",
+      "df.createOrReplaceTempView(\u001b[33m\"\u001b[39;49;00m\u001b[33mspark_ai_temp_view__1575533453\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m)\u001b[37m\u001b[39;49;00m\n",
       "\n",
       "\n",
       "\n",
       "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
       "\u001b[32;1m\u001b[1;3mThought: The keyword '38,000,000' is most similar to the sample values in the `population` column.\n",
       "I need to filter on an exact value from the `population` column, so I will use the tool similar_value to help me choose my filter value.\n",
       "Action: similar_value\n",
-      "Action Input: 38,000,000|population|spark_ai_temp_view_1020457715\u001b[0m\n",
+      "Action Input: 38,000,000|population|spark_ai_temp_view__1575533453\u001b[0m\n",
       "Observation: \u001b[38;5;200m\u001b[1;3m38 million\u001b[0m\n",
       "Thought:\u001b[32;1m\u001b[1;3mThe correct `population` filter should be '38 million' because it is semantically closest to the keyword.\n",
       "I will use the column `population` to filter the rows where its value is '38 million' and then select the `gdp` \n",
       "because the question asks for the GDP of the country.\n",
       "Action: query_validation\n",
-      "Action Input: SELECT `gdp` FROM `spark_ai_temp_view_1020457715` WHERE `population` = '38 million'\u001b[0m\n",
+      "Action Input: SELECT `gdp` FROM `spark_ai_temp_view__1575533453` WHERE `population` = '38 million'\u001b[0m\n",
       "Observation: \u001b[33;1m\u001b[1;3mOK\u001b[0m\n",
       "Thought:\u001b[32;1m\u001b[1;3mI now know the final answer.\n",
-      "Final Answer: SELECT `gdp` FROM `spark_ai_temp_view_1020457715` WHERE `population` = '38 million'\u001b[0m\n",
+      "Final Answer: SELECT `gdp` FROM `spark_ai_temp_view__1575533453` WHERE `population` = '38 million'\u001b[0m\n",
       "\n",
       "\u001b[1m> Finished chain.\u001b[0m\n",
       "+-----------------+\n",
@@ -297,24 +296,24 @@
      "output_type": "stream",
      "text": [
       "\u001b[92mINFO: \u001b[0mCreating temp view for the transform:\n",
-      "df.createOrReplaceTempView(\u001b[33m\"\u001b[39;49;00m\u001b[33mspark_ai_temp_view_1020457715\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m)\u001b[37m\u001b[39;49;00m\n",
+      "df.createOrReplaceTempView(\u001b[33m\"\u001b[39;49;00m\u001b[33mspark_ai_temp_view__1575533453\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m)\u001b[37m\u001b[39;49;00m\n",
       "\n",
       "\n",
       "\n",
       "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
       "\u001b[32;1m\u001b[1;3mThought: The keyword '.7 percent population growth' is most similar to the sample values in the `population growth (%)` column.\n",
       "I need to filter on an exact value from the `population growth (%)` column, so I will use the tool similar_value to help me choose my filter value.\n",
       "Action: similar_value\n",
-      "Action Input: .7 percent population growth|population growth (%)|spark_ai_temp_view_1020457715\u001b[0m\n",
+      "Action Input: .7|population growth (%)|spark_ai_temp_view__1575533453\u001b[0m\n",
       "Observation: \u001b[38;5;200m\u001b[1;3m0.7\u001b[0m\n",
       "Thought:\u001b[32;1m\u001b[1;3mThe correct `population growth (%)` filter should be '0.7' because it is semantically closest to the keyword.\n",
       "I will use the column `population growth (%)` to filter the rows where its value is '0.7' and then select the AVG(`gdp growth (%)`) \n",
       "because the question asks for the average GDP growth.\n",
       "Action: query_validation\n",
-      "Action Input: SELECT AVG(`gdp growth (%)`) FROM `spark_ai_temp_view_1020457715` WHERE `population growth (%)` = 0.7\u001b[0m\n",
+      "Action Input: SELECT AVG(`gdp growth (%)`) FROM `spark_ai_temp_view__1575533453` WHERE `population growth (%)` = 0.7\u001b[0m\n",
       "Observation: \u001b[33;1m\u001b[1;3mOK\u001b[0m\n",
       "Thought:\u001b[32;1m\u001b[1;3mI now know the final answer.\n",
-      "Final Answer: SELECT AVG(`gdp growth (%)`) FROM `spark_ai_temp_view_1020457715` WHERE `population growth (%)` = 0.7\u001b[0m\n",
+      "Final Answer: SELECT AVG(`gdp growth (%)`) FROM `spark_ai_temp_view__1575533453` WHERE `population growth (%)` = 0.7\u001b[0m\n",
       "\n",
       "\u001b[1m> Finished chain.\u001b[0m\n",
       "+-------------------+\n",
@@ -340,28 +339,28 @@
      "output_type": "stream",
      "text": [
       "\u001b[92mINFO: \u001b[0mCreating temp view for the transform:\n",
-      "df.createOrReplaceTempView(\u001b[33m\"\u001b[39;49;00m\u001b[33mspark_ai_temp_view_1020457715\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m)\u001b[37m\u001b[39;49;00m\n",
+      "df.createOrReplaceTempView(\u001b[33m\"\u001b[39;49;00m\u001b[33mspark_ai_temp_view__1575533453\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m)\u001b[37m\u001b[39;49;00m\n",
       "\n",
       "\n",
       "\n",
       "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
       "\u001b[32;1m\u001b[1;3mThought: The keyword 'Japan' is most similar to the sample values in the `country` column.\n",
       "I need to filter on an exact value from the `country` column, so I will use the tool similar_value to help me choose my filter value.\n",
       "Action: similar_value\n",
-      "Action Input: Japan|country|spark_ai_temp_view_1020457715\u001b[0m\n",
+      "Action Input: Japan|country|spark_ai_temp_view__1575533453\u001b[0m\n",
       "Observation: \u001b[38;5;200m\u001b[1;3mJPN\u001b[0m\n",
       "Thought:\u001b[32;1m\u001b[1;3mThe correct `country` filter should be 'JPN' because it is semantically closest to the keyword.\n",
       "I will use the column `country` to filter the rows where its value is 'JPN' and then select the `population growth (%)` \n",
       "because the question asks for the countries with higher population growth than Japan.\n",
       "Action: query_sql_db\n",
-      "Action Input: SELECT `population growth (%)` FROM `spark_ai_temp_view_1020457715` WHERE `country` = 'JPN'\u001b[0m\n",
+      "Action Input: \"SELECT `population growth (%)` FROM spark_ai_temp_view__1575533453 WHERE `country` = 'JPN'\"\u001b[0m\n",
       "Observation: \u001b[36;1m\u001b[1;3m[('0.2',)]\u001b[0m\n",
       "Thought:\u001b[32;1m\u001b[1;3mThe population growth of Japan is 0.2%. Now I will write a query to find the countries with a higher population growth.\n",
       "Action: query_validation\n",
-      "Action Input: SELECT `country` FROM `spark_ai_temp_view_1020457715` WHERE `population growth (%)` > 0.2\u001b[0m\n",
+      "Action Input: SELECT `country` FROM `spark_ai_temp_view__1575533453` WHERE `population growth (%)` > 0.2\u001b[0m\n",
       "Observation: \u001b[33;1m\u001b[1;3mOK\u001b[0m\n",
       "Thought:\u001b[32;1m\u001b[1;3mI now know the final answer.\n",
-      "Final Answer: SELECT `country` FROM `spark_ai_temp_view_1020457715` WHERE `population growth (%)` > 0.2\u001b[0m\n",
+      "Final Answer: SELECT `country` FROM `spark_ai_temp_view__1575533453` WHERE `population growth (%)` > 0.2\u001b[0m\n",
       "\n",
       "\u001b[1m> Finished chain.\u001b[0m\n",
       "+-------+\n",