From c258d36c956a7d3d5b8fd3c5f13336f2c4c611ff Mon Sep 17 00:00:00 2001 From: Akshat Gupta Date: Tue, 2 Jul 2024 10:19:05 +0530 Subject: [PATCH 1/4] add output type to custom evals --- athina/evals/llm/custom_prompt/evaluator.py | 3 +++ athina/evals/llm/llm_evaluator.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/athina/evals/llm/custom_prompt/evaluator.py b/athina/evals/llm/custom_prompt/evaluator.py index 8efd95b..6dec50c 100644 --- a/athina/evals/llm/custom_prompt/evaluator.py +++ b/athina/evals/llm/custom_prompt/evaluator.py @@ -14,6 +14,7 @@ class CustomPrompt(LlmEvaluator): """ _eval_prompt: Optional[str] = None + _output_type: Optional[str] = None _display_name: str = None _metric_ids: List[str] = None _model: str = None @@ -23,6 +24,7 @@ class CustomPrompt(LlmEvaluator): def __init__( self, eval_prompt: str, + output_type: str = None, display_name: str = None, metric_ids: List[str] = None, model: str = None, @@ -37,6 +39,7 @@ def __init__( raise ValueError("model is not defined") self._eval_prompt = eval_prompt + self._output_type = output_type self._display_name = display_name self._metric_ids = metric_ids self._model = model diff --git a/athina/evals/llm/llm_evaluator.py b/athina/evals/llm/llm_evaluator.py index d430001..ebd168a 100644 --- a/athina/evals/llm/llm_evaluator.py +++ b/athina/evals/llm/llm_evaluator.py @@ -1,4 +1,4 @@ -import traceback +stuimport traceback from abc import ABC, abstractmethod import time from typing import List, Optional From bc020156c8903f496b690b0e3cca7e040c8eef65 Mon Sep 17 00:00:00 2001 From: Akshat Gupta Date: Wed, 3 Jul 2024 10:58:34 +0530 Subject: [PATCH 2/4] add output type logic to custom prompt --- athina/evals/llm/custom_prompt/evaluator.py | 79 ++++++++++++++++++++- athina/evals/llm/llm_evaluator.py | 2 +- athina/metrics/metric_type.py | 1 + 3 files changed, 80 insertions(+), 2 deletions(-) diff --git a/athina/evals/llm/custom_prompt/evaluator.py b/athina/evals/llm/custom_prompt/evaluator.py index 6dec50c..f85ad22 100644 --- a/athina/evals/llm/custom_prompt/evaluator.py +++ b/athina/evals/llm/custom_prompt/evaluator.py @@ -1,3 +1,5 @@ +import time +from athina.helpers.logger import logger from typing import List, Optional, Dict from jinja2 import Environment from athina.helpers.jinja_helper import PreserveUndefined @@ -6,7 +8,8 @@ from ..llm_evaluator import LlmEvaluator from athina.evals.eval_type import LlmEvalTypeId from ..example import FewShotExample - +from athina.interfaces.result import EvalResult, EvalResultMetric +from athina.metrics.metric_type import MetricType class CustomPrompt(LlmEvaluator): """ @@ -98,3 +101,77 @@ def is_failure(self, result) -> Optional[bool]: def _user_message(self, **kwargs) -> str: template = self.env.from_string(self._user_message_template) return template.render(**kwargs) + + def _system_message(self) -> str: + if self._output_type == 'boolean': + return ( + "### INSTRUCTIONS ###\n" + "You are an expert at evaluating responses by an AI.\n" + "Based on the instructions provided, you will evaluate the response and determine if it passes or fails.\n" + "You MUST return a JSON object with the following fields:\n" + "- result: Result must be either 'Pass' or 'Fail'.\n" + "- explanation: An explanation of why the result is Pass or Fail.\n" + ) + elif self._output_type == 'numeric': + return ( + "### INSTRUCTIONS ###\n" + "You are an expert at evaluating responses by an AI.\n" + "Based on the instructions provided, you will evaluate the response and provide a score.\n" + "You MUST 
return a JSON object with the following fields:\n" + "- score: The score based on the provided grading criteria.\n" + "- explanation: An explanation of the score.\n" + ) + else: + return super()._system_message() + + def _evaluate(self, **kwargs) -> EvalResult: + """ + Run the LLM evaluator. + """ + start_time = time.time() + # Validate that correct args were passed + self.validate_args(**kwargs) + + # Construct Prompt + messages = self._prompt_messages(**kwargs) + + # Run the LLM Completion + + chat_completion_response_json: dict = self.llm_service.json_completion( + model=self._model, + messages=messages, + temperature=self.TEMPERATURE, + ) + + metrics = [] + try: + if self._output_type == 'boolean': + result = chat_completion_response_json["result"] + explanation = chat_completion_response_json["explanation"] + failure = self.is_failure(result) + passed_value = 1 - float(failure) + metrics.append(EvalResultMetric(id=MetricType.PASSED.value, value=passed_value)) + elif self._output_type == 'numeric': + score = chat_completion_response_json["score"] + explanation = chat_completion_response_json["explanation"] + metrics.append(EvalResultMetric(id=MetricType.CUSTOM_PROMPT_SCORE.value, value=score)) + failure = None # Numeric evaluations don't have a pass/fail result + + except Exception as e: + logger.error(f"Error occurred during eval: {e}") + raise e + + end_time = time.time() + eval_runtime_ms = int((end_time - start_time) * 1000) + llm_eval_result = EvalResult( + name=self.name, + display_name=self.display_name, + data=kwargs, + failure=failure, + reason=explanation, + runtime=eval_runtime_ms, + model=self._model, + metrics=metrics, + ) + return {k: v for k, v in llm_eval_result.items() if v is not None} + diff --git a/athina/evals/llm/llm_evaluator.py b/athina/evals/llm/llm_evaluator.py index ebd168a..d430001 100644 --- a/athina/evals/llm/llm_evaluator.py +++ b/athina/evals/llm/llm_evaluator.py @@ -1,4 +1,4 @@ -stuimport traceback +import traceback from abc import ABC, abstractmethod import time from typing import List, Optional diff --git a/athina/metrics/metric_type.py b/athina/metrics/metric_type.py index 2a5c026..f1599e6 100644 --- a/athina/metrics/metric_type.py +++ b/athina/metrics/metric_type.py @@ -28,6 +28,7 @@ class MetricType(Enum): GROUNDEDNESS = "groundedness" PASSED = "passed" SIMILARITY_SCORE = "similarity_score" + CUSTOM_PROMPT_SCORE = "custom_prompt_score" # Conversation Metrics CONVERSATION_RESOLUTION = "conversation_resolution" From cc365a40cea202ab0ef0de0b0b7417dfa3a24856 Mon Sep 17 00:00:00 2001 From: Akshat Gupta Date: Wed, 3 Jul 2024 12:55:51 +0530 Subject: [PATCH 3/4] add output type logic to custom prompt --- athina/evals/llm/custom_prompt/evaluator.py | 6 +- athina/metrics/metric_type.py | 2 +- examples/run_custom_eval.ipynb | 121 +++++++++++--------- 3 files changed, 72 insertions(+), 57 deletions(-) diff --git a/athina/evals/llm/custom_prompt/evaluator.py b/athina/evals/llm/custom_prompt/evaluator.py index f85ad22..440c013 100644 --- a/athina/evals/llm/custom_prompt/evaluator.py +++ b/athina/evals/llm/custom_prompt/evaluator.py @@ -148,13 +148,17 @@ def _evaluate(self, **kwargs) -> EvalResult: if self._output_type == 'boolean': result = chat_completion_response_json["result"] explanation = chat_completion_response_json["explanation"] + print(f"result: {result}") + print(f"explanation: {explanation}") failure = self.is_failure(result) passed_value = 1 - float(failure) metrics.append(EvalResultMetric(id=MetricType.PASSED.value, value=passed_value)) elif 
self._output_type == 'numeric': score = chat_completion_response_json["score"] explanation = chat_completion_response_json["explanation"] - metrics.append(EvalResultMetric(id=MetricType.CUSTOM_PROMPT_SCORE.value, value=score)) + print(f"score: {score}") + print(f"explanation: {explanation}") + metrics.append(EvalResultMetric(id=MetricType.SCORE.value, value=score)) failure = None # Numeric evaluations don't have a pass/fail result except Exception as e: diff --git a/athina/metrics/metric_type.py b/athina/metrics/metric_type.py index f1599e6..77106f8 100644 --- a/athina/metrics/metric_type.py +++ b/athina/metrics/metric_type.py @@ -28,7 +28,7 @@ class MetricType(Enum): GROUNDEDNESS = "groundedness" PASSED = "passed" SIMILARITY_SCORE = "similarity_score" - CUSTOM_PROMPT_SCORE = "custom_prompt_score" + SCORE = "score" # Conversation Metrics CONVERSATION_RESOLUTION = "conversation_resolution" diff --git a/examples/run_custom_eval.ipynb b/examples/run_custom_eval.ipynb index b30296e..a10e25d 100644 --- a/examples/run_custom_eval.ipynb +++ b/examples/run_custom_eval.ipynb @@ -2,20 +2,9 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/akshat_g/athina/repos/athina-evals/.venv/lib/python3.9/site-packages/urllib3/__init__.py:35: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. See: https://github.com/urllib3/urllib3/issues/3020\n", - " warnings.warn(\n", - "/Users/akshat_g/athina/repos/athina-evals/.venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], + "outputs": [], "source": [ "import os\n", "from athina.evals import CustomPrompt\n", @@ -29,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -63,7 +52,7 @@ " \n", " 0\n", " What is the capital of Greece?\n", - " [Greece is often called the cradle of Western ...\n", + " [Greece is often called the cradle of Western civilization.]\n", " Athens\n", " None\n", " \n", @@ -71,14 +60,14 @@ " 1\n", " What is the price of a Tesla Model 3?\n", " [Tesla Model 3 is a fully electric car.]\n", - " I cannot answer this question as prices vary f...\n", + " I cannot answer this question as prices vary from country to country.\n", " None\n", " \n", " \n", " 2\n", " What is a shooting star?\n", - " [Black holes are stars that have collapsed und...\n", - " A shooting star is a meteor that burns up in t...\n", + " [Black holes are stars that have collapsed under their own gravity. They are so dense that nothing can escape their gravitational pull, not even light.]\n", + " A shooting star is a meteor that burns up in the atmosphere.\n", " None\n", " \n", " \n", @@ -91,18 +80,23 @@ "1 What is the price of a Tesla Model 3? \n", "2 What is a shooting star? \n", "\n", - " context \\\n", - "0 [Greece is often called the cradle of Western ... \n", - "1 [Tesla Model 3 is a fully electric car.] \n", - "2 [Black holes are stars that have collapsed und... \n", + " context \\\n", + "0 [Greece is often called the cradle of Western civilization.] \n", + "1 [Tesla Model 3 is a fully electric car.] \n", + "2 [Black holes are stars that have collapsed under their own gravity. 
They are so dense that nothing can escape their gravitational pull, not even light.] \n", "\n", - " response expected_response \n", - "0 Athens None \n", - "1 I cannot answer this question as prices vary f... None \n", - "2 A shooting star is a meteor that burns up in t... None " + " response \\\n", + "0 Athens \n", + "1 I cannot answer this question as prices vary from country to country. \n", + "2 A shooting star is a meteor that burns up in the atmosphere. \n", + "\n", + " expected_response \n", + "0 None \n", + "1 None \n", + "2 None " ] }, - "execution_count": 2, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -133,9 +127,25 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Error logging eval results to Athina: \n", + "Invalid `transaction.dataset_eval_result.createMany()` invocation in\n", + "/Users/akshat_g/athina/repos/athina-api/src/services/evalRun.service.ts:309:47\n", + "\n", + " 306 )\n", + " 307 }\n", + " 308 \n", + "→ 309 await transaction.dataset_eval_result.createMany(\n", + "Foreign key constraint failed on the field: `dataset_eval_result_metric_id_fkey (index)` (Extra Info: No Details)\n", + "You can view your dataset at: https://app.athina.ai/develop/f0165b6f-975f-4c70-93c9-e881501773f9\n" + ] + }, { "data": { "text/html": [ @@ -166,7 +176,7 @@ " grade_reason\n", " runtime\n", " model\n", - " passed\n", + " custom_prompt_score\n", " \n", " \n", " \n", @@ -177,11 +187,11 @@ " Athens\n", " None\n", " Response should answer user's query\n", - " False\n", - " The response provided the correct answer to the user's query.\n", - " 1350\n", + " None\n", + " The response provided an answer to the user's query.\n", + " 869\n", " gpt-3.5-turbo\n", - " 1.0\n", + " 1\n", " \n", " \n", " 1\n", @@ -190,11 +200,11 @@ " I cannot answer this question as prices vary from country to country.\n", " None\n", " Response should answer user's query\n", - " True\n", - " The response refuses to answer the user's query, which does not meet the criteria for a pass.\n", - " 1161\n", + " None\n", + " The response provided an answer to the user's query.\n", + " 2008\n", " gpt-3.5-turbo\n", - " 0.0\n", + " 1\n", " \n", " \n", " 2\n", @@ -203,11 +213,11 @@ " A shooting star is a meteor that burns up in the atmosphere.\n", " None\n", " Response should answer user's query\n", - " False\n", - " The response provides a clear answer to the user's query, explaining that a shooting star is a meteor that burns up in the atmosphere.\n", - " 1232\n", + " None\n", + " The response provided an answer to the user's query.\n", + " 943\n", " gpt-3.5-turbo\n", - " 1.0\n", + " 1\n", " \n", " \n", "\n", @@ -229,23 +239,23 @@ "1 I cannot answer this question as prices vary from country to country. \n", "2 A shooting star is a meteor that burns up in the atmosphere. \n", "\n", - " expected_response display_name failed \\\n", - "0 None Response should answer user's query False \n", - "1 None Response should answer user's query True \n", - "2 None Response should answer user's query False \n", + " expected_response display_name failed \\\n", + "0 None Response should answer user's query None \n", + "1 None Response should answer user's query None \n", + "2 None Response should answer user's query None \n", "\n", - " grade_reason \\\n", - "0 The response provided the correct answer to the user's query. 
\n", - "1 The response refuses to answer the user's query, which does not meet the criteria for a pass. \n", - "2 The response provides a clear answer to the user's query, explaining that a shooting star is a meteor that burns up in the atmosphere. \n", + " grade_reason runtime \\\n", + "0 The response provided an answer to the user's query. 869 \n", + "1 The response provided an answer to the user's query. 2008 \n", + "2 The response provided an answer to the user's query. 943 \n", "\n", - " runtime model passed \n", - "0 1350 gpt-3.5-turbo 1.0 \n", - "1 1161 gpt-3.5-turbo 0.0 \n", - "2 1232 gpt-3.5-turbo 1.0 " + " model custom_prompt_score \n", + "0 gpt-3.5-turbo 1 \n", + "1 gpt-3.5-turbo 1 \n", + "2 gpt-3.5-turbo 1 " ] }, - "execution_count": 3, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -254,13 +264,14 @@ "# Checks if the LLM response answers the user query sufficiently\n", "eval_model = \"gpt-3.5-turbo\"\n", "eval_prompt = \"\"\"\n", - "If the response refuses to answer the user's query, then fail. Otherwise pass.\n", + "Based on the coherence of response, give the score ranging from 1 to 5.\n", "\n", "User Query: {query}\n", "Response: {response}\"\"\"\n", "CustomPrompt(\n", " eval_prompt=eval_prompt, \n", " model=eval_model, \n", + " output_type=\"numeric\",\n", " display_name=\"Response should answer user's query\",\n", ").run_batch(data=dataset).to_df()" ] From 702f1c74f0696e0766737c7d54632001913a419d Mon Sep 17 00:00:00 2001 From: Akshat Gupta Date: Fri, 5 Jul 2024 18:01:32 +0530 Subject: [PATCH 4/4] default value of output type --- athina/evals/llm/custom_prompt/evaluator.py | 8 +- examples/run_custom_eval.ipynb | 118 ++++++++++---------- pyproject.toml | 2 +- 3 files changed, 62 insertions(+), 66 deletions(-) diff --git a/athina/evals/llm/custom_prompt/evaluator.py b/athina/evals/llm/custom_prompt/evaluator.py index 440c013..f628fb0 100644 --- a/athina/evals/llm/custom_prompt/evaluator.py +++ b/athina/evals/llm/custom_prompt/evaluator.py @@ -27,7 +27,7 @@ class CustomPrompt(LlmEvaluator): def __init__( self, eval_prompt: str, - output_type: str = None, + output_type: str = 'boolean', display_name: str = None, metric_ids: List[str] = None, model: str = None, @@ -121,8 +121,6 @@ def _system_message(self) -> str: "- score: The score based on the provided grading criteria.\n" "- explanation: An explanation of the score.\n" ) - else: - return super()._system_message() def _evaluate(self, **kwargs) -> EvalResult: """ @@ -148,16 +146,12 @@ def _evaluate(self, **kwargs) -> EvalResult: if self._output_type == 'boolean': result = chat_completion_response_json["result"] explanation = chat_completion_response_json["explanation"] - print(f"result: {result}") - print(f"explanation: {explanation}") failure = self.is_failure(result) passed_value = 1 - float(failure) metrics.append(EvalResultMetric(id=MetricType.PASSED.value, value=passed_value)) elif self._output_type == 'numeric': score = chat_completion_response_json["score"] explanation = chat_completion_response_json["explanation"] - print(f"score: {score}") - print(f"explanation: {explanation}") metrics.append(EvalResultMetric(id=MetricType.SCORE.value, value=score)) failure = None # Numeric evaluations don't have a pass/fail result diff --git a/examples/run_custom_eval.ipynb b/examples/run_custom_eval.ipynb index a10e25d..fab47b7 100644 --- a/examples/run_custom_eval.ipynb +++ b/examples/run_custom_eval.ipynb @@ -2,9 +2,20 @@ "cells": [ { "cell_type": "code", - "execution_count": 4, + "execution_count": 
1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/akshat_g/athina/repos/athina-evals/.venv/lib/python3.9/site-packages/urllib3/__init__.py:35: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. See: https://github.com/urllib3/urllib3/issues/3020\n", + " warnings.warn(\n", + "/Users/akshat_g/athina/repos/athina-evals/.venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], "source": [ "import os\n", "from athina.evals import CustomPrompt\n", @@ -13,12 +24,12 @@ "import pandas as pd\n", "\n", "OpenAiApiKey.set_key(os.getenv('OPENAI_API_KEY'))\n", - "AthinaApiKey.set_key(os.getenv('ATHINA_API_KEY'))" + "# AthinaApiKey.set_key(os.getenv('ATHINA_API_KEY'))" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -52,7 +63,7 @@ " \n", " 0\n", " What is the capital of Greece?\n", - " [Greece is often called the cradle of Western civilization.]\n", + " [Greece is often called the cradle of Western ...\n", " Athens\n", " None\n", " \n", @@ -60,14 +71,14 @@ " 1\n", " What is the price of a Tesla Model 3?\n", " [Tesla Model 3 is a fully electric car.]\n", - " I cannot answer this question as prices vary from country to country.\n", + " I cannot answer this question as prices vary f...\n", " None\n", " \n", " \n", " 2\n", " What is a shooting star?\n", - " [Black holes are stars that have collapsed under their own gravity. They are so dense that nothing can escape their gravitational pull, not even light.]\n", - " A shooting star is a meteor that burns up in the atmosphere.\n", + " [Black holes are stars that have collapsed und...\n", + " A shooting star is a meteor that burns up in t...\n", " None\n", " \n", " \n", @@ -80,23 +91,18 @@ "1 What is the price of a Tesla Model 3? \n", "2 What is a shooting star? \n", "\n", - " context \\\n", - "0 [Greece is often called the cradle of Western civilization.] \n", - "1 [Tesla Model 3 is a fully electric car.] \n", - "2 [Black holes are stars that have collapsed under their own gravity. They are so dense that nothing can escape their gravitational pull, not even light.] \n", + " context \\\n", + "0 [Greece is often called the cradle of Western ... \n", + "1 [Tesla Model 3 is a fully electric car.] \n", + "2 [Black holes are stars that have collapsed und... \n", "\n", - " response \\\n", - "0 Athens \n", - "1 I cannot answer this question as prices vary from country to country. \n", - "2 A shooting star is a meteor that burns up in the atmosphere. \n", - "\n", - " expected_response \n", - "0 None \n", - "1 None \n", - "2 None " + " response expected_response \n", + "0 Athens None \n", + "1 I cannot answer this question as prices vary f... None \n", + "2 A shooting star is a meteor that burns up in t... 
None " ] }, - "execution_count": 5, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -127,23 +133,14 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Error logging eval results to Athina: \n", - "Invalid `transaction.dataset_eval_result.createMany()` invocation in\n", - "/Users/akshat_g/athina/repos/athina-api/src/services/evalRun.service.ts:309:47\n", - "\n", - " 306 )\n", - " 307 }\n", - " 308 \n", - "→ 309 await transaction.dataset_eval_result.createMany(\n", - "Foreign key constraint failed on the field: `dataset_eval_result_metric_id_fkey (index)` (Extra Info: No Details)\n", - "You can view your dataset at: https://app.athina.ai/develop/f0165b6f-975f-4c70-93c9-e881501773f9\n" + "Error logging dataset to Athina: ('Connection aborted.', BadStatusLine('ÿ\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x01\\x7ft\\x01/2.32.3\\r\\n'))\n" ] }, { @@ -176,7 +173,7 @@ " grade_reason\n", " runtime\n", " model\n", - " custom_prompt_score\n", + " score\n", " \n", " \n", " \n", @@ -186,10 +183,10 @@ " [Greece is often called the cradle of Western civilization.]\n", " Athens\n", " None\n", - " Response should answer user's query\n", + " Response should answer user's query coherently\n", " None\n", - " The response provided an answer to the user's query.\n", - " 869\n", + " The response does not provide any information or context to evaluate the coherence. It lacks the necessary details to assign a score.\n", + " 1012\n", " gpt-3.5-turbo\n", " 1\n", " \n", @@ -199,10 +196,10 @@ " [Tesla Model 3 is a fully electric car.]\n", " I cannot answer this question as prices vary from country to country.\n", " None\n", - " Response should answer user's query\n", + " Response should answer user's query coherently\n", " None\n", - " The response provided an answer to the user's query.\n", - " 2008\n", + " The response does not provide any information or context to evaluate the coherence. It lacks necessary details for a proper assessment.\n", + " 1136\n", " gpt-3.5-turbo\n", " 1\n", " \n", @@ -212,10 +209,10 @@ " [Black holes are stars that have collapsed under their own gravity. They are so dense that nothing can escape their gravitational pull, not even light.]\n", " A shooting star is a meteor that burns up in the atmosphere.\n", " None\n", - " Response should answer user's query\n", + " Response should answer user's query coherently\n", " None\n", - " The response provided an answer to the user's query.\n", - " 943\n", + " The response does not provide any information or context to evaluate the coherence. It lacks the necessary details to assign a score.\n", + " 1074\n", " gpt-3.5-turbo\n", " 1\n", " \n", @@ -239,23 +236,23 @@ "1 I cannot answer this question as prices vary from country to country. \n", "2 A shooting star is a meteor that burns up in the atmosphere. \n", "\n", - " expected_response display_name failed \\\n", - "0 None Response should answer user's query None \n", - "1 None Response should answer user's query None \n", - "2 None Response should answer user's query None \n", + " expected_response display_name failed \\\n", + "0 None Response should answer user's query coherently None \n", + "1 None Response should answer user's query coherently None \n", + "2 None Response should answer user's query coherently None \n", "\n", - " grade_reason runtime \\\n", - "0 The response provided an answer to the user's query. 
869 \n", - "1 The response provided an answer to the user's query. 2008 \n", - "2 The response provided an answer to the user's query. 943 \n", + " grade_reason \\\n", + "0 The response does not provide any information or context to evaluate the coherence. It lacks the necessary details to assign a score. \n", + "1 The response does not provide any information or context to evaluate the coherence. It lacks necessary details for a proper assessment. \n", + "2 The response does not provide any information or context to evaluate the coherence. It lacks the necessary details to assign a score. \n", "\n", - " model custom_prompt_score \n", - "0 gpt-3.5-turbo 1 \n", - "1 gpt-3.5-turbo 1 \n", - "2 gpt-3.5-turbo 1 " + " runtime model score \n", + "0 1012 gpt-3.5-turbo 1 \n", + "1 1136 gpt-3.5-turbo 1 \n", + "2 1074 gpt-3.5-turbo 1 " ] }, - "execution_count": 7, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -268,11 +265,16 @@ "\n", "User Query: {query}\n", "Response: {response}\"\"\"\n", + "eval_prompt_1 = \"\"\"\n", + "If response answers the query, then pass otherwise fail.\n", + "\n", + "User Query: {query}\n", + "Response: {response}\"\"\"\n", "CustomPrompt(\n", " eval_prompt=eval_prompt, \n", + " output_type='numeric',\n", " model=eval_model, \n", - " output_type=\"numeric\",\n", - " display_name=\"Response should answer user's query\",\n", + " display_name=\"Response should answer user's query coherently\",\n", ").run_batch(data=dataset).to_df()" ] } diff --git a/pyproject.toml b/pyproject.toml index f8fc9b2..4e8e55f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "athina" -version = "1.4.19" +version = "1.4.25" description = "Python SDK to configure and run evaluations for your LLM-based application" authors = ["Shiv Sakhuja ", "Akshat Gupta ", "Vivek Aditya ", "Akhil Bisht "] readme = "README.md"
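
Usage sketch for the feature added by this series (not part of the patches themselves): the new `output_type` parameter of `CustomPrompt`, as exercised in the notebook changes of PATCH 3/4 and 4/4. The dataset construction via `Loader().load_dict(...)` is an assumption inferred from the notebook's imports, since the cell that builds `dataset` is outside these diffs; the prompt, model, and display name are taken from the notebook.

    import os
    from athina.evals import CustomPrompt
    from athina.loaders import Loader
    from athina.keys import OpenAiApiKey

    OpenAiApiKey.set_key(os.getenv("OPENAI_API_KEY"))

    # Records matching the notebook's dataset (query / context / response).
    raw_data = [
        {
            "query": "What is the capital of Greece?",
            "context": ["Greece is often called the cradle of Western civilization."],
            "response": "Athens",
        },
        {
            "query": "What is the price of a Tesla Model 3?",
            "context": ["Tesla Model 3 is a fully electric car."],
            "response": "I cannot answer this question as prices vary from country to country.",
        },
    ]
    # Assumed loader call; the notebook cell that creates `dataset` is not shown in this diff.
    dataset = Loader().load_dict(raw_data)

    # With output_type='numeric' the evaluator emits a "score" metric (MetricType.SCORE)
    # and leaves failure as None; with the default 'boolean' it emits a "passed" metric
    # derived from the Pass/Fail result. The default is 'boolean' as of PATCH 4/4.
    numeric_eval_prompt = """
    Based on the coherence of response, give the score ranging from 1 to 5.

    User Query: {query}
    Response: {response}"""

    CustomPrompt(
        eval_prompt=numeric_eval_prompt,
        output_type="numeric",
        model="gpt-3.5-turbo",
        display_name="Response should answer user's query coherently",
    ).run_batch(data=dataset).to_df()

In the numeric case the returned dataframe carries a `score` column and a None `failed` column, mirroring the notebook output in PATCH 4/4, while the boolean case keeps the original pass/fail behaviour of `CustomPrompt`.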