Merge pull request sinaptik-ai#16 from yzaparto/zaparto/error-correcting-framework

gventuri · web-flow · commit ce303eeb3daf · 2023-05-02T00:30:21.000+02:00
Error Correcting Framework
diff --git a/.pylintrc b/.pylintrc
@@ -1 +1,2 @@
+[MASTER]
 ignore=test_*
diff --git a/examples/data/sample_dataframe.py b/examples/data/sample_dataframe.py
@@ -0,0 +1,29 @@
+"""Sample data for dataframe examples."""
+
+dataframe = {
+    "country": [
+        "United States",
+        "United Kingdom",
+        "France",
+        "Germany",
+        "Italy",
+        "Spain",
+        "Canada",
+        "Australia",
+        "Japan",
+        "China",
+    ],
+    "gdp": [
+        21400000,
+        2940000,
+        2830000,
+        3870000,
+        2160000,
+        1350000,
+        1780000,
+        1320000,
+        516000,
+        14000000,
+    ],
+    "happiness_index": [7.3, 7.2, 6.5, 7.0, 6.0, 6.3, 7.3, 7.3, 5.9, 5.0],
+}
diff --git a/examples/from_dataframe.py b/examples/from_dataframe.py
@@ -3,36 +3,10 @@
 import pandas as pd
 from pandasai import PandasAI
 from pandasai.llm.openai import OpenAI
+# Absolute import so the example can be run directly as a script.
+from data.sample_dataframe import dataframe
 
-df = pd.DataFrame(
-    {
-        "country": [
-            "United States",
-            "United Kingdom",
-            "France",
-            "Germany",
-            "Italy",
-            "Spain",
-            "Canada",
-            "Australia",
-            "Japan",
-            "China",
-        ],
-        "gdp": [
-            21400000,
-            2940000,
-            2830000,
-            3870000,
-            2160000,
-            1350000,
-            1780000,
-            1320000,
-            516000,
-            14000000,
-        ],
-        "happiness_index": [7.3, 7.2, 6.5, 7.0, 6.0, 6.3, 7.3, 7.3, 5.9, 5.0],
-    }
-)
+df = pd.DataFrame(dataframe)
 
 llm = OpenAI()
 pandas_ai = PandasAI(llm, verbose=True, conversational=False)
diff --git a/examples/with_privacy_enforced.py b/examples/with_privacy_enforced.py
@@ -3,44 +3,16 @@
 import pandas as pd
 from pandasai import PandasAI
 from pandasai.llm.openai import OpenAI
+# Absolute import so the example can be run directly as a script.
+from data.sample_dataframe import dataframe
 
-df = pd.DataFrame(
-    {
-        "country": [
-            "United States",
-            "United Kingdom",
-            "France",
-            "Germany",
-            "Italy",
-            "Spain",
-            "Canada",
-            "Australia",
-            "Japan",
-            "China",
-        ],
-        "gdp": [
-            21400000,
-            2940000,
-            2830000,
-            3870000,
-            2160000,
-            1350000,
-            1780000,
-            1320000,
-            516000,
-            14000000,
-        ],
-        "happiness_index": [7.3, 7.2, 6.5, 7.0, 6.0, 6.3, 7.3, 7.3, 5.9, 5.0],
-    }
-)
+df = pd.DataFrame(dataframe)
 
 llm = OpenAI()
-pandas_ai = PandasAI(llm, verbose=True, conversational=False)
+pandas_ai = PandasAI(llm, verbose=True, conversational=False, enforce_privacy=True)
 response = pandas_ai.run(
     df,
     "Calculate the sum of the gdp of north american countries",
-    enforce_privacy=True,
-    is_conversational_answer=True,
 )
 print(response)
 # Output: 26200000
diff --git a/pandasai/__init__.py b/pandasai/__init__.py
@@ -15,23 +15,41 @@ class PandasAI:
 This is the result of `print(df.head({rows_to_display}))`:
 {df_head}.
 
-Return the python code (do not import anything) to get the answer to the following question:
+Return the python code (do not import anything) and make sure to prefix the python code with <startCode> exactly and suffix the code with <endCode> exactly 
+to get the answer to the following question :
 """
     _response_instruction: str = """
 Question: {question}
 Answer: {answer}
 
 Rewrite the answer to the question in a conversational way.
 """
+
+    _error_correct_instruction: str = """
+    For the task defined below:
+    {orig_task}
+    you generated this python code:
+    {code}
+    and this fails with the following error:
+    {error_returned}
+    Correct the python code and return a new python code (do not import anything) that fixes the above mentioned error.
+    Make sure to prefix the python code with <startCode> exactly and suffix the code with <endCode> exactly.
+    """
     _llm: LLM
     _verbose: bool = False
     _is_conversational_answer: bool = True
     _enforce_privacy: bool = False
+    _max_retries: int = 3
+    _original_instruction_and_prompt = None
     last_code_generated: str = None
     code_output: str = None
 
     def __init__(
-        self, llm=None, conversational=True, verbose=False, enforce_privacy=False
+        self,
+        llm=None,
+        conversational=True,
+        verbose=False,
+        enforce_privacy=False,
     ):
         if llm is None:
             raise LLMNotFoundError(
@@ -74,6 +92,13 @@ def run(
             ),
             prompt,
         )
+        self._original_instruction_and_prompt = (
+            self._task_instruction.format(
+                df_head=data_frame.head(rows_to_display),
+                rows_to_display=rows_to_display,
+            )
+            + prompt
+        )
         self.last_code_generated = code
         self.log(
             f"""
@@ -83,7 +108,7 @@ def run(
 ```"""
         )
 
-        answer = self.run_code(code, data_frame)
+        answer = self.run_code(code, data_frame, False)
         self.code_output = answer
         self.log(f"Answer: {answer}")
 
@@ -95,7 +120,10 @@ def run(
         return answer
 
     def run_code(
-        self, code: str, df: pd.DataFrame  # pylint: disable=W0613 disable=C0103
+        self,
+        code: str,
+        df: pd.DataFrame,  # pylint: disable=W0613 disable=C0103
+        use_error_correction_framework: bool = False,
     ) -> str:
         # pylint: disable=W0122 disable=W0123 disable=W0702:bare-except
         """Run the code in the current context and return the result"""
@@ -105,7 +133,32 @@ def run_code(
         sys.stdout = output
 
         # Execute the code
-        exec(code)
+        if use_error_correction_framework:
+            # Run the code; on failure ask the LLM for a corrected version and
+            # try again, up to self._max_retries attempts.
+            count = 0
+            code_to_run = code
+            while count < self._max_retries:
+                try:
+                    exec(code_to_run)
+                    code = code_to_run
+                    break
+                except Exception as e:  # pylint: disable=W0718 disable=C0103
+                    count += 1
+                    error_correcting_instruction = (
+                        self._error_correct_instruction.format(
+                            orig_task=self._original_instruction_and_prompt,
+                            # Send the attempt that actually raised, so the
+                            # code and the error in the prompt belong together.
+                            code=code_to_run,
+                            error_returned=e,
+                        )
+                    )
+                    code_to_run = self._llm.generate_code(
+                        error_correcting_instruction, ""
+                    )
+        else:
+            exec(code)
 
         # Restore standard output and get the captured output
         sys.stdout = sys.__stdout__
diff --git a/pandasai/llm/base.py b/pandasai/llm/base.py
@@ -59,6 +59,11 @@ def _extract_code(self, response: str, separator: str = "```") -> str:
         code = response
         if len(response.split(separator)) > 1:
             code = response.split(separator)[1]
+        # Use search(), not match(): the <startCode> tag need not be at the very
+        # start of the text (e.g. after a leading newline or a language tag).
+        tagged = re.search(r"<startCode>([\s\S]*?)<\/?endCode>", code)
+        if tagged:
+            code = tagged.group(1)
         code = self._polish_code(code)
         if not self._is_python_code(code):
             raise NoCodeFoundError("No code found in the response")
diff --git a/tests/test_pandasai.py b/tests/test_pandasai.py
@@ -139,7 +139,8 @@ def test_run_with_privacy_enforcement(self):
 Columns: [country]
 Index: [].
 
-Return the python code (do not import anything) to get the answer to the following question:
+Return the python code (do not import anything) and make sure to prefix the python code with <startCode> exactly and suffix the code with <endCode> exactly 
+to get the answer to the following question :
 How many countries are in the dataframe?"""
         self.pandasai.run(df, "How many countries are in the dataframe?")
         assert self.pandasai._llm.last_prompt == expected_prompt
@@ -159,7 +160,8 @@ def test_run_without_privacy_enforcement(self):
 1  United Kingdom
 2          France.
 
-Return the python code (do not import anything) to get the answer to the following question:
+Return the python code (do not import anything) and make sure to prefix the python code with <startCode> exactly and suffix the code with <endCode> exactly 
+to get the answer to the following question :
 How many countries are in the dataframe?"""
         self.pandasai.run(df, "How many countries are in the dataframe?")
-        assert self.pandasai._llm.last_prompt == expected_prompt
+        assert self.pandasai._llm.last_prompt == expected_prompt