Merge pull request #82 from athina-ai/feature/non-boolean-custom-evals
non boolean custom evals
akshat-g authored Jul 5, 2024
2 parents 2d4c45d + 3ecdf25 commit 10780b1
Showing 4 changed files with 125 additions and 33 deletions.
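For quick reference, below is a minimal usage sketch of the output_type parameter this PR introduces, mirroring the example notebook changed in this diff. The import path is assumed from the file layout shown here (the package may also re-export CustomPrompt elsewhere), and dataset is a placeholder for data loaded as in examples/run_custom_eval.ipynb.

    # Sketch only: import path assumed from athina/evals/llm/custom_prompt/evaluator.py
    from athina.evals.llm.custom_prompt.evaluator import CustomPrompt

    eval_model = "gpt-3.5-turbo"
    # Grading prompt that asks for a 1-5 score instead of a Pass/Fail verdict.
    eval_prompt = """
    Based on the coherence of the response, give a score ranging from 1 to 5.

    User Query: {query}
    Response: {response}"""

    CustomPrompt(
        eval_prompt=eval_prompt,
        output_type='numeric',   # new in this PR; defaults to 'boolean'
        model=eval_model,
        display_name="Response should answer user's query coherently",
    ).run_batch(data=dataset).to_df()  # dataset: hypothetical, loaded as in the notebook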
80 changes: 79 additions & 1 deletion athina/evals/llm/custom_prompt/evaluator.py
@@ -1,3 +1,5 @@
import time
from athina.helpers.logger import logger
from typing import List, Optional, Dict
from jinja2 import Environment
from athina.helpers.jinja_helper import PreserveUndefined
@@ -6,14 +8,16 @@
from ..llm_evaluator import LlmEvaluator
from athina.evals.eval_type import LlmEvalTypeId
from ..example import FewShotExample

from athina.interfaces.result import EvalResult, EvalResultMetric
from athina.metrics.metric_type import MetricType

class CustomPrompt(LlmEvaluator):
"""
This evaluator can be configured with custom examples and instructions.
"""

_eval_prompt: Optional[str] = None
_output_type: Optional[str] = None
_display_name: str = None
_metric_ids: List[str] = None
_model: str = None
@@ -23,6 +27,7 @@ class CustomPrompt(LlmEvaluator):
def __init__(
self,
eval_prompt: str,
output_type: str = 'boolean',
display_name: str = None,
metric_ids: List[str] = None,
model: str = None,
@@ -37,6 +42,7 @@ def __init__(
raise ValueError("model is not defined")

self._eval_prompt = eval_prompt
self._output_type = output_type
self._display_name = display_name
self._metric_ids = metric_ids
self._model = model
@@ -95,3 +101,75 @@ def is_failure(self, result) -> Optional[bool]:
def _user_message(self, **kwargs) -> str:
template = self.env.from_string(self._user_message_template)
return template.render(**kwargs)

def _system_message(self) -> str:
if self._output_type == 'boolean':
return (
"### INSTRUCTIONS ###\n"
"You are an expert at evaluating responses by an AI.\n"
"Based on the instructions provided, you will evaluate the response and determine if it passes or fails.\n"
"You MUST return a JSON object with the following fields:\n"
"- result: Result must be either 'Pass' or 'Fail'.\n"
"- explanation: An explanation of why the result is Pass or Fail.\n"
)
elif self._output_type == 'numeric':
return (
"### INSTRUCTIONS ###\n"
"You are an expert at evaluating responses by an AI.\n"
"Based on the instructions provided, you will evaluate the response and provide a score.\n"
"You MUST return a JSON object with the following fields:\n"
"- score: The score based on the provided grading criteria.\n"
"- explanation: An explanation of the score.\n"
)

def _evaluate(self, **kwargs) -> EvalResult:
"""
Run the LLM evaluator.
"""
start_time = time.time()
# Validate that correct args were passed
self.validate_args(**kwargs)

# Construct Prompt
messages = self._prompt_messages(**kwargs)

# Run the LLM Completion

chat_completion_response_json: dict = self.llm_service.json_completion(
model=self._model,
messages=messages,
temperature=self.TEMPERATURE,
)

metrics = []
try:
if self._output_type == 'boolean':
result = chat_completion_response_json["result"]
explanation = chat_completion_response_json["explanation"]
failure = self.is_failure(result)
passed_value = 1 - float(failure)
metrics.append(EvalResultMetric(id=MetricType.PASSED.value, value=passed_value))
elif self._output_type == 'numeric':
score = chat_completion_response_json["score"]
explanation = chat_completion_response_json["explanation"]
metrics.append(EvalResultMetric(id=MetricType.SCORE.value, value=score))
failure = None # Numeric evaluations don't have a pass/fail result

except Exception as e:
logger.error(f"Error occurred during eval: {e}")
raise e

end_time = time.time()
eval_runtime_ms = int((end_time - start_time) * 1000)
llm_eval_result = EvalResult(
name=self.name,
display_name=self.display_name,
data=kwargs,
failure=failure,
reason=explanation,
runtime=eval_runtime_ms,
model=self._model,
metrics=metrics,
)
return {k: v for k, v in llm_eval_result.items() if v is not None}

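As an illustrative sketch (values made up), this is the JSON shape the numeric branch of _evaluate requests from llm_service.json_completion and the metric entry it builds from it; the imports follow the paths shown in this diff.

    from athina.interfaces.result import EvalResultMetric
    from athina.metrics.metric_type import MetricType

    # Example of the JSON object the numeric system message asks the model to return.
    chat_completion_response_json = {
        "score": 4,
        "explanation": "The response is mostly coherent.",
    }

    # _evaluate records the score under the new SCORE metric type ("score").
    metric = EvalResultMetric(
        id=MetricType.SCORE.value,
        value=chat_completion_response_json["score"],
    )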
1 change: 1 addition & 0 deletions athina/metrics/metric_type.py
@@ -28,6 +28,7 @@ class MetricType(Enum):
GROUNDEDNESS = "groundedness"
PASSED = "passed"
SIMILARITY_SCORE = "similarity_score"
SCORE = "score"

# Conversation Metrics
CONVERSATION_RESOLUTION = "conversation_resolution"
75 changes: 44 additions & 31 deletions examples/run_custom_eval.ipynb
@@ -24,7 +24,7 @@
"import pandas as pd\n",
"\n",
"OpenAiApiKey.set_key(os.getenv('OPENAI_API_KEY'))\n",
"AthinaApiKey.set_key(os.getenv('ATHINA_API_KEY'))"
"# AthinaApiKey.set_key(os.getenv('ATHINA_API_KEY'))"
]
},
{
@@ -136,6 +136,13 @@
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error logging dataset to Athina: ('Connection aborted.', BadStatusLine('ÿ\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x01\\x7ft\\x01/2.32.3\\r\\n'))\n"
]
},
{
"data": {
"text/html": [
@@ -166,7 +173,7 @@
" <th>grade_reason</th>\n",
" <th>runtime</th>\n",
" <th>model</th>\n",
" <th>passed</th>\n",
" <th>score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
@@ -176,38 +183,38 @@
" <td>[Greece is often called the cradle of Western civilization.]</td>\n",
" <td>Athens</td>\n",
" <td>None</td>\n",
" <td>Response should answer user's query</td>\n",
" <td>False</td>\n",
" <td>The response provided the correct answer to the user's query.</td>\n",
" <td>1350</td>\n",
" <td>Response should answer user's query coherently</td>\n",
" <td>None</td>\n",
" <td>The response does not provide any information or context to evaluate the coherence. It lacks the necessary details to assign a score.</td>\n",
" <td>1012</td>\n",
" <td>gpt-3.5-turbo</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>What is the price of a Tesla Model 3?</td>\n",
" <td>[Tesla Model 3 is a fully electric car.]</td>\n",
" <td>I cannot answer this question as prices vary from country to country.</td>\n",
" <td>None</td>\n",
" <td>Response should answer user's query</td>\n",
" <td>True</td>\n",
" <td>The response refuses to answer the user's query, which does not meet the criteria for a pass.</td>\n",
" <td>1161</td>\n",
" <td>Response should answer user's query coherently</td>\n",
" <td>None</td>\n",
" <td>The response does not provide any information or context to evaluate the coherence. It lacks necessary details for a proper assessment.</td>\n",
" <td>1136</td>\n",
" <td>gpt-3.5-turbo</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>What is a shooting star?</td>\n",
" <td>[Black holes are stars that have collapsed under their own gravity. They are so dense that nothing can escape their gravitational pull, not even light.]</td>\n",
" <td>A shooting star is a meteor that burns up in the atmosphere.</td>\n",
" <td>None</td>\n",
" <td>Response should answer user's query</td>\n",
" <td>False</td>\n",
" <td>The response provides a clear answer to the user's query, explaining that a shooting star is a meteor that burns up in the atmosphere.</td>\n",
" <td>1232</td>\n",
" <td>Response should answer user's query coherently</td>\n",
" <td>None</td>\n",
" <td>The response does not provide any information or context to evaluate the coherence. It lacks the necessary details to assign a score.</td>\n",
" <td>1074</td>\n",
" <td>gpt-3.5-turbo</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
@@ -229,20 +236,20 @@
"1 I cannot answer this question as prices vary from country to country. \n",
"2 A shooting star is a meteor that burns up in the atmosphere. \n",
"\n",
" expected_response display_name failed \\\n",
"0 None Response should answer user's query False \n",
"1 None Response should answer user's query True \n",
"2 None Response should answer user's query False \n",
" expected_response display_name failed \\\n",
"0 None Response should answer user's query coherently None \n",
"1 None Response should answer user's query coherently None \n",
"2 None Response should answer user's query coherently None \n",
"\n",
" grade_reason \\\n",
"0 The response provided the correct answer to the user's query. \n",
"1 The response refuses to answer the user's query, which does not meet the criteria for a pass. \n",
"2 The response provides a clear answer to the user's query, explaining that a shooting star is a meteor that burns up in the atmosphere. \n",
" grade_reason \\\n",
"0 The response does not provide any information or context to evaluate the coherence. It lacks the necessary details to assign a score. \n",
"1 The response does not provide any information or context to evaluate the coherence. It lacks necessary details for a proper assessment. \n",
"2 The response does not provide any information or context to evaluate the coherence. It lacks the necessary details to assign a score. \n",
"\n",
" runtime model passed \n",
"0 1350 gpt-3.5-turbo 1.0 \n",
"1 1161 gpt-3.5-turbo 0.0 \n",
"2 1232 gpt-3.5-turbo 1.0 "
" runtime model score \n",
"0 1012 gpt-3.5-turbo 1 \n",
"1 1136 gpt-3.5-turbo 1 \n",
"2 1074 gpt-3.5-turbo 1 "
]
},
"execution_count": 3,
@@ -254,14 +261,20 @@
"# Checks if the LLM response answers the user query sufficiently\n",
"eval_model = \"gpt-3.5-turbo\"\n",
"eval_prompt = \"\"\"\n",
"If the response refuses to answer the user's query, then fail. Otherwise pass.\n",
"Based on the coherence of response, give the score ranging from 1 to 5.\n",
"\n",
"User Query: {query}\n",
"Response: {response}\"\"\"\n",
"eval_prompt_1 = \"\"\"\n",
"If response answers the query, then pass otherwise fail.\n",
"\n",
"User Query: {query}\n",
"Response: {response}\"\"\"\n",
"CustomPrompt(\n",
" eval_prompt=eval_prompt, \n",
" output_type='numeric',\n",
" model=eval_model, \n",
" display_name=\"Response should answer user's query\",\n",
" display_name=\"Response should answer user's query coherently\",\n",
").run_batch(data=dataset).to_df()"
]
}
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "athina"
version = "1.4.24"
version = "1.4.25"
description = "Python SDK to configure and run evaluations for your LLM-based application"
authors = ["Shiv Sakhuja <[email protected]>", "Akshat Gupta <[email protected]>", "Vivek Aditya <[email protected]>", "Akhil Bisht <[email protected]>"]
readme = "README.md"
