From c258d36c956a7d3d5b8fd3c5f13336f2c4c611ff Mon Sep 17 00:00:00 2001 From: Akshat Gupta Date: Tue, 2 Jul 2024 10:19:05 +0530 Subject: [PATCH 1/4] add output type to custom evals --- athina/evals/llm/custom_prompt/evaluator.py | 3 +++ athina/evals/llm/llm_evaluator.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/athina/evals/llm/custom_prompt/evaluator.py b/athina/evals/llm/custom_prompt/evaluator.py index 8efd95b..6dec50c 100644 --- a/athina/evals/llm/custom_prompt/evaluator.py +++ b/athina/evals/llm/custom_prompt/evaluator.py @@ -14,6 +14,7 @@ class CustomPrompt(LlmEvaluator): """ _eval_prompt: Optional[str] = None + _output_type: Optional[str] = None _display_name: str = None _metric_ids: List[str] = None _model: str = None @@ -23,6 +24,7 @@ class CustomPrompt(LlmEvaluator): def __init__( self, eval_prompt: str, + output_type: str = None, display_name: str = None, metric_ids: List[str] = None, model: str = None, @@ -37,6 +39,7 @@ def __init__( raise ValueError("model is not defined") self._eval_prompt = eval_prompt + self._output_type = output_type self._display_name = display_name self._metric_ids = metric_ids self._model = model diff --git a/athina/evals/llm/llm_evaluator.py b/athina/evals/llm/llm_evaluator.py index d430001..ebd168a 100644 --- a/athina/evals/llm/llm_evaluator.py +++ b/athina/evals/llm/llm_evaluator.py @@ -1,4 +1,4 @@ -import traceback +stuimport traceback from abc import ABC, abstractmethod import time from typing import List, Optional From bc020156c8903f496b690b0e3cca7e040c8eef65 Mon Sep 17 00:00:00 2001 From: Akshat Gupta Date: Wed, 3 Jul 2024 10:58:34 +0530 Subject: [PATCH 2/4] add output type logic to custom prompt --- athina/evals/llm/custom_prompt/evaluator.py | 79 ++++++++++++++++++++- athina/evals/llm/llm_evaluator.py | 2 +- athina/metrics/metric_type.py | 1 + 3 files changed, 80 insertions(+), 2 deletions(-) diff --git a/athina/evals/llm/custom_prompt/evaluator.py b/athina/evals/llm/custom_prompt/evaluator.py index 6dec50c..f85ad22 100644 --- a/athina/evals/llm/custom_prompt/evaluator.py +++ b/athina/evals/llm/custom_prompt/evaluator.py @@ -1,3 +1,5 @@ +import time +from athina.helpers.logger import logger from typing import List, Optional, Dict from jinja2 import Environment from athina.helpers.jinja_helper import PreserveUndefined @@ -6,7 +8,8 @@ from ..llm_evaluator import LlmEvaluator from athina.evals.eval_type import LlmEvalTypeId from ..example import FewShotExample - +from athina.interfaces.result import EvalResult, EvalResultMetric +from athina.metrics.metric_type import MetricType class CustomPrompt(LlmEvaluator): """ @@ -98,3 +101,77 @@ def is_failure(self, result) -> Optional[bool]: def _user_message(self, **kwargs) -> str: template = self.env.from_string(self._user_message_template) return template.render(**kwargs) + + def _system_message(self) -> str: + if self._output_type == 'boolean': + return ( + "### INSTRUCTIONS ###\n" + "You are an expert at evaluating responses by an AI.\n" + "Based on the instructions provided, you will evaluate the response and determine if it passes or fails.\n" + "You MUST return a JSON object with the following fields:\n" + "- result: Result must be either 'Pass' or 'Fail'.\n" + "- explanation: An explanation of why the result is Pass or Fail.\n" + ) + elif self._output_type == 'numeric': + return ( + "### INSTRUCTIONS ###\n" + "You are an expert at evaluating responses by an AI.\n" + "Based on the instructions provided, you will evaluate the response and provide a score.\n" + "You MUST 
return a JSON object with the following fields:\n" + "- score: The score based on the provided grading criteria.\n" + "- explanation: An explanation of the score.\n" + ) + else: + return super()._system_message() + + def _evaluate(self, **kwargs) -> EvalResult: + """ + Run the LLM evaluator. + """ + start_time = time.time() + # Validate that correct args were passed + self.validate_args(**kwargs) + + # Construct Prompt + messages = self._prompt_messages(**kwargs) + + # Run the LLM Completion + + chat_completion_response_json: dict = self.llm_service.json_completion( + model=self._model, + messages=messages, + temperature=self.TEMPERATURE, + ) + + metrics = [] + try: + if self._output_type == 'boolean': + result = chat_completion_response_json["result"] + explanation = chat_completion_response_json["explanation"] + failure = self.is_failure(result) + passed_value = 1 - float(failure) + metrics.append(EvalResultMetric(id=MetricType.PASSED.value, value=passed_value)) + elif self._output_type == 'numeric': + score = chat_completion_response_json["score"] + explanation = chat_completion_response_json["explanation"] + metrics.append(EvalResultMetric(id=MetricType.CUSTOM_PROMPT_SCORE.value, value=score)) + failure = None # Numeric evaluations don't have a pass/fail result + + except Exception as e: + logger.error(f"Error occurred during eval: {e}") + raise e + + end_time = time.time() + eval_runtime_ms = int((end_time - start_time) * 1000) + llm_eval_result = EvalResult( + name=self.name, + display_name=self.display_name, + data=kwargs, + failure=failure, + reason=explanation, + runtime=eval_runtime_ms, + model=self._model, + metrics=metrics, + ) + return {k: v for k, v in llm_eval_result.items() if v is not None} + diff --git a/athina/evals/llm/llm_evaluator.py b/athina/evals/llm/llm_evaluator.py index ebd168a..d430001 100644 --- a/athina/evals/llm/llm_evaluator.py +++ b/athina/evals/llm/llm_evaluator.py @@ -1,4 +1,4 @@ -stuimport traceback +import traceback from abc import ABC, abstractmethod import time from typing import List, Optional diff --git a/athina/metrics/metric_type.py b/athina/metrics/metric_type.py index 2a5c026..f1599e6 100644 --- a/athina/metrics/metric_type.py +++ b/athina/metrics/metric_type.py @@ -28,6 +28,7 @@ class MetricType(Enum): GROUNDEDNESS = "groundedness" PASSED = "passed" SIMILARITY_SCORE = "similarity_score" + CUSTOM_PROMPT_SCORE = "custom_prompt_score" # Conversation Metrics CONVERSATION_RESOLUTION = "conversation_resolution" From cc365a40cea202ab0ef0de0b0b7417dfa3a24856 Mon Sep 17 00:00:00 2001 From: Akshat Gupta Date: Wed, 3 Jul 2024 12:55:51 +0530 Subject: [PATCH 3/4] add output type logic to custom prompt --- athina/evals/llm/custom_prompt/evaluator.py | 6 +- athina/metrics/metric_type.py | 2 +- examples/run_custom_eval.ipynb | 121 +++++++++++--------- 3 files changed, 72 insertions(+), 57 deletions(-) diff --git a/athina/evals/llm/custom_prompt/evaluator.py b/athina/evals/llm/custom_prompt/evaluator.py index f85ad22..440c013 100644 --- a/athina/evals/llm/custom_prompt/evaluator.py +++ b/athina/evals/llm/custom_prompt/evaluator.py @@ -148,13 +148,17 @@ def _evaluate(self, **kwargs) -> EvalResult: if self._output_type == 'boolean': result = chat_completion_response_json["result"] explanation = chat_completion_response_json["explanation"] + print(f"result: {result}") + print(f"explanation: {explanation}") failure = self.is_failure(result) passed_value = 1 - float(failure) metrics.append(EvalResultMetric(id=MetricType.PASSED.value, value=passed_value)) elif 
self._output_type == 'numeric': score = chat_completion_response_json["score"] explanation = chat_completion_response_json["explanation"] - metrics.append(EvalResultMetric(id=MetricType.CUSTOM_PROMPT_SCORE.value, value=score)) + print(f"score: {score}") + print(f"explanation: {explanation}") + metrics.append(EvalResultMetric(id=MetricType.SCORE.value, value=score)) failure = None # Numeric evaluations don't have a pass/fail result except Exception as e: diff --git a/athina/metrics/metric_type.py b/athina/metrics/metric_type.py index f1599e6..77106f8 100644 --- a/athina/metrics/metric_type.py +++ b/athina/metrics/metric_type.py @@ -28,7 +28,7 @@ class MetricType(Enum): GROUNDEDNESS = "groundedness" PASSED = "passed" SIMILARITY_SCORE = "similarity_score" - CUSTOM_PROMPT_SCORE = "custom_prompt_score" + SCORE = "score" # Conversation Metrics CONVERSATION_RESOLUTION = "conversation_resolution" diff --git a/examples/run_custom_eval.ipynb b/examples/run_custom_eval.ipynb index b30296e..a10e25d 100644 --- a/examples/run_custom_eval.ipynb +++ b/examples/run_custom_eval.ipynb @@ -2,20 +2,9 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/akshat_g/athina/repos/athina-evals/.venv/lib/python3.9/site-packages/urllib3/__init__.py:35: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. See: https://github.com/urllib3/urllib3/issues/3020\n", - " warnings.warn(\n", - "/Users/akshat_g/athina/repos/athina-evals/.venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], + "outputs": [], "source": [ "import os\n", "from athina.evals import CustomPrompt\n", @@ -29,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -63,7 +52,7 @@ " \n", " 0\n", " What is the capital of Greece?\n", - " [Greece is often called the cradle of Western ...\n", + " [Greece is often called the cradle of Western civilization.]\n", " Athens\n", " None\n", " \n", @@ -71,14 +60,14 @@ " 1\n", " What is the price of a Tesla Model 3?\n", " [Tesla Model 3 is a fully electric car.]\n", - " I cannot answer this question as prices vary f...\n", + " I cannot answer this question as prices vary from country to country.\n", " None\n", " \n", " \n", " 2\n", " What is a shooting star?\n", - " [Black holes are stars that have collapsed und...\n", - " A shooting star is a meteor that burns up in t...\n", + " [Black holes are stars that have collapsed under their own gravity. They are so dense that nothing can escape their gravitational pull, not even light.]\n", + " A shooting star is a meteor that burns up in the atmosphere.\n", " None\n", " \n", " \n", @@ -91,18 +80,23 @@ "1 What is the price of a Tesla Model 3? \n", "2 What is a shooting star? \n", "\n", - " context \\\n", - "0 [Greece is often called the cradle of Western ... \n", - "1 [Tesla Model 3 is a fully electric car.] \n", - "2 [Black holes are stars that have collapsed und... \n", + " context \\\n", + "0 [Greece is often called the cradle of Western civilization.] \n", + "1 [Tesla Model 3 is a fully electric car.] \n", + "2 [Black holes are stars that have collapsed under their own gravity. 
They are so dense that nothing can escape their gravitational pull, not even light.] \n", "\n", - " response expected_response \n", - "0 Athens None \n", - "1 I cannot answer this question as prices vary f... None \n", - "2 A shooting star is a meteor that burns up in t... None " + " response \\\n", + "0 Athens \n", + "1 I cannot answer this question as prices vary from country to country. \n", + "2 A shooting star is a meteor that burns up in the atmosphere. \n", + "\n", + " expected_response \n", + "0 None \n", + "1 None \n", + "2 None " ] }, - "execution_count": 2, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -133,9 +127,25 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Error logging eval results to Athina: \n", + "Invalid `transaction.dataset_eval_result.createMany()` invocation in\n", + "/Users/akshat_g/athina/repos/athina-api/src/services/evalRun.service.ts:309:47\n", + "\n", + " 306 )\n", + " 307 }\n", + " 308 \n", + "→ 309 await transaction.dataset_eval_result.createMany(\n", + "Foreign key constraint failed on the field: `dataset_eval_result_metric_id_fkey (index)` (Extra Info: No Details)\n", + "You can view your dataset at: https://app.athina.ai/develop/f0165b6f-975f-4c70-93c9-e881501773f9\n" + ] + }, { "data": { "text/html": [ @@ -166,7 +176,7 @@ " grade_reason\n", " runtime\n", " model\n", - " passed\n", + " custom_prompt_score\n", " \n", " \n", " \n", @@ -177,11 +187,11 @@ " Athens\n", " None\n", " Response should answer user's query\n", - " False\n", - " The response provided the correct answer to the user's query.\n", - " 1350\n", + " None\n", + " The response provided an answer to the user's query.\n", + " 869\n", " gpt-3.5-turbo\n", - " 1.0\n", + " 1\n", " \n", " \n", " 1\n", @@ -190,11 +200,11 @@ " I cannot answer this question as prices vary from country to country.\n", " None\n", " Response should answer user's query\n", - " True\n", - " The response refuses to answer the user's query, which does not meet the criteria for a pass.\n", - " 1161\n", + " None\n", + " The response provided an answer to the user's query.\n", + " 2008\n", " gpt-3.5-turbo\n", - " 0.0\n", + " 1\n", " \n", " \n", " 2\n", @@ -203,11 +213,11 @@ " A shooting star is a meteor that burns up in the atmosphere.\n", " None\n", " Response should answer user's query\n", - " False\n", - " The response provides a clear answer to the user's query, explaining that a shooting star is a meteor that burns up in the atmosphere.\n", - " 1232\n", + " None\n", + " The response provided an answer to the user's query.\n", + " 943\n", " gpt-3.5-turbo\n", - " 1.0\n", + " 1\n", " \n", " \n", "\n", @@ -229,23 +239,23 @@ "1 I cannot answer this question as prices vary from country to country. \n", "2 A shooting star is a meteor that burns up in the atmosphere. \n", "\n", - " expected_response display_name failed \\\n", - "0 None Response should answer user's query False \n", - "1 None Response should answer user's query True \n", - "2 None Response should answer user's query False \n", + " expected_response display_name failed \\\n", + "0 None Response should answer user's query None \n", + "1 None Response should answer user's query None \n", + "2 None Response should answer user's query None \n", "\n", - " grade_reason \\\n", - "0 The response provided the correct answer to the user's query. 
\n", - "1 The response refuses to answer the user's query, which does not meet the criteria for a pass. \n", - "2 The response provides a clear answer to the user's query, explaining that a shooting star is a meteor that burns up in the atmosphere. \n", + " grade_reason runtime \\\n", + "0 The response provided an answer to the user's query. 869 \n", + "1 The response provided an answer to the user's query. 2008 \n", + "2 The response provided an answer to the user's query. 943 \n", "\n", - " runtime model passed \n", - "0 1350 gpt-3.5-turbo 1.0 \n", - "1 1161 gpt-3.5-turbo 0.0 \n", - "2 1232 gpt-3.5-turbo 1.0 " + " model custom_prompt_score \n", + "0 gpt-3.5-turbo 1 \n", + "1 gpt-3.5-turbo 1 \n", + "2 gpt-3.5-turbo 1 " ] }, - "execution_count": 3, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -254,13 +264,14 @@ "# Checks if the LLM response answers the user query sufficiently\n", "eval_model = \"gpt-3.5-turbo\"\n", "eval_prompt = \"\"\"\n", - "If the response refuses to answer the user's query, then fail. Otherwise pass.\n", + "Based on the coherence of response, give the score ranging from 1 to 5.\n", "\n", "User Query: {query}\n", "Response: {response}\"\"\"\n", "CustomPrompt(\n", " eval_prompt=eval_prompt, \n", " model=eval_model, \n", + " output_type=\"numeric\",\n", " display_name=\"Response should answer user's query\",\n", ").run_batch(data=dataset).to_df()" ] From 702f1c74f0696e0766737c7d54632001913a419d Mon Sep 17 00:00:00 2001 From: Akshat Gupta Date: Fri, 5 Jul 2024 18:01:32 +0530 Subject: [PATCH 4/4] default value of output type --- athina/evals/llm/custom_prompt/evaluator.py | 8 +- examples/run_custom_eval.ipynb | 118 ++++++++++---------- pyproject.toml | 2 +- 3 files changed, 62 insertions(+), 66 deletions(-) diff --git a/athina/evals/llm/custom_prompt/evaluator.py b/athina/evals/llm/custom_prompt/evaluator.py index 440c013..f628fb0 100644 --- a/athina/evals/llm/custom_prompt/evaluator.py +++ b/athina/evals/llm/custom_prompt/evaluator.py @@ -27,7 +27,7 @@ class CustomPrompt(LlmEvaluator): def __init__( self, eval_prompt: str, - output_type: str = None, + output_type: str = 'boolean', display_name: str = None, metric_ids: List[str] = None, model: str = None, @@ -121,8 +121,6 @@ def _system_message(self) -> str: "- score: The score based on the provided grading criteria.\n" "- explanation: An explanation of the score.\n" ) - else: - return super()._system_message() def _evaluate(self, **kwargs) -> EvalResult: """ @@ -148,16 +146,12 @@ def _evaluate(self, **kwargs) -> EvalResult: if self._output_type == 'boolean': result = chat_completion_response_json["result"] explanation = chat_completion_response_json["explanation"] - print(f"result: {result}") - print(f"explanation: {explanation}") failure = self.is_failure(result) passed_value = 1 - float(failure) metrics.append(EvalResultMetric(id=MetricType.PASSED.value, value=passed_value)) elif self._output_type == 'numeric': score = chat_completion_response_json["score"] explanation = chat_completion_response_json["explanation"] - print(f"score: {score}") - print(f"explanation: {explanation}") metrics.append(EvalResultMetric(id=MetricType.SCORE.value, value=score)) failure = None # Numeric evaluations don't have a pass/fail result diff --git a/examples/run_custom_eval.ipynb b/examples/run_custom_eval.ipynb index a10e25d..fab47b7 100644 --- a/examples/run_custom_eval.ipynb +++ b/examples/run_custom_eval.ipynb @@ -2,9 +2,20 @@ "cells": [ { "cell_type": "code", - "execution_count": 4, + "execution_count": 
1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/akshat_g/athina/repos/athina-evals/.venv/lib/python3.9/site-packages/urllib3/__init__.py:35: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. See: https://github.com/urllib3/urllib3/issues/3020\n", + " warnings.warn(\n", + "/Users/akshat_g/athina/repos/athina-evals/.venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], "source": [ "import os\n", "from athina.evals import CustomPrompt\n", @@ -13,12 +24,12 @@ "import pandas as pd\n", "\n", "OpenAiApiKey.set_key(os.getenv('OPENAI_API_KEY'))\n", - "AthinaApiKey.set_key(os.getenv('ATHINA_API_KEY'))" + "# AthinaApiKey.set_key(os.getenv('ATHINA_API_KEY'))" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -52,7 +63,7 @@ " \n", " 0\n", " What is the capital of Greece?\n", - " [Greece is often called the cradle of Western civilization.]\n", + " [Greece is often called the cradle of Western ...\n", " Athens\n", " None\n", " \n", @@ -60,14 +71,14 @@ " 1\n", " What is the price of a Tesla Model 3?\n", " [Tesla Model 3 is a fully electric car.]\n", - " I cannot answer this question as prices vary from country to country.\n", + " I cannot answer this question as prices vary f...\n", " None\n", " \n", " \n", " 2\n", " What is a shooting star?\n", - " [Black holes are stars that have collapsed under their own gravity. They are so dense that nothing can escape their gravitational pull, not even light.]\n", - " A shooting star is a meteor that burns up in the atmosphere.\n", + " [Black holes are stars that have collapsed und...\n", + " A shooting star is a meteor that burns up in t...\n", " None\n", " \n", " \n", @@ -80,23 +91,18 @@ "1 What is the price of a Tesla Model 3? \n", "2 What is a shooting star? \n", "\n", - " context \\\n", - "0 [Greece is often called the cradle of Western civilization.] \n", - "1 [Tesla Model 3 is a fully electric car.] \n", - "2 [Black holes are stars that have collapsed under their own gravity. They are so dense that nothing can escape their gravitational pull, not even light.] \n", + " context \\\n", + "0 [Greece is often called the cradle of Western ... \n", + "1 [Tesla Model 3 is a fully electric car.] \n", + "2 [Black holes are stars that have collapsed und... \n", "\n", - " response \\\n", - "0 Athens \n", - "1 I cannot answer this question as prices vary from country to country. \n", - "2 A shooting star is a meteor that burns up in the atmosphere. \n", - "\n", - " expected_response \n", - "0 None \n", - "1 None \n", - "2 None " + " response expected_response \n", + "0 Athens None \n", + "1 I cannot answer this question as prices vary f... None \n", + "2 A shooting star is a meteor that burns up in t... 
None " ] }, - "execution_count": 5, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -127,23 +133,14 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Error logging eval results to Athina: \n", - "Invalid `transaction.dataset_eval_result.createMany()` invocation in\n", - "/Users/akshat_g/athina/repos/athina-api/src/services/evalRun.service.ts:309:47\n", - "\n", - " 306 )\n", - " 307 }\n", - " 308 \n", - "→ 309 await transaction.dataset_eval_result.createMany(\n", - "Foreign key constraint failed on the field: `dataset_eval_result_metric_id_fkey (index)` (Extra Info: No Details)\n", - "You can view your dataset at: https://app.athina.ai/develop/f0165b6f-975f-4c70-93c9-e881501773f9\n" + "Error logging dataset to Athina: ('Connection aborted.', BadStatusLine('ÿ\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x01\\x7ft\\x01/2.32.3\\r\\n'))\n" ] }, { @@ -176,7 +173,7 @@ " grade_reason\n", " runtime\n", " model\n", - " custom_prompt_score\n", + " score\n", " \n", " \n", " \n", @@ -186,10 +183,10 @@ " [Greece is often called the cradle of Western civilization.]\n", " Athens\n", " None\n", - " Response should answer user's query\n", + " Response should answer user's query coherently\n", " None\n", - " The response provided an answer to the user's query.\n", - " 869\n", + " The response does not provide any information or context to evaluate the coherence. It lacks the necessary details to assign a score.\n", + " 1012\n", " gpt-3.5-turbo\n", " 1\n", " \n", @@ -199,10 +196,10 @@ " [Tesla Model 3 is a fully electric car.]\n", " I cannot answer this question as prices vary from country to country.\n", " None\n", - " Response should answer user's query\n", + " Response should answer user's query coherently\n", " None\n", - " The response provided an answer to the user's query.\n", - " 2008\n", + " The response does not provide any information or context to evaluate the coherence. It lacks necessary details for a proper assessment.\n", + " 1136\n", " gpt-3.5-turbo\n", " 1\n", " \n", @@ -212,10 +209,10 @@ " [Black holes are stars that have collapsed under their own gravity. They are so dense that nothing can escape their gravitational pull, not even light.]\n", " A shooting star is a meteor that burns up in the atmosphere.\n", " None\n", - " Response should answer user's query\n", + " Response should answer user's query coherently\n", " None\n", - " The response provided an answer to the user's query.\n", - " 943\n", + " The response does not provide any information or context to evaluate the coherence. It lacks the necessary details to assign a score.\n", + " 1074\n", " gpt-3.5-turbo\n", " 1\n", " \n", @@ -239,23 +236,23 @@ "1 I cannot answer this question as prices vary from country to country. \n", "2 A shooting star is a meteor that burns up in the atmosphere. \n", "\n", - " expected_response display_name failed \\\n", - "0 None Response should answer user's query None \n", - "1 None Response should answer user's query None \n", - "2 None Response should answer user's query None \n", + " expected_response display_name failed \\\n", + "0 None Response should answer user's query coherently None \n", + "1 None Response should answer user's query coherently None \n", + "2 None Response should answer user's query coherently None \n", "\n", - " grade_reason runtime \\\n", - "0 The response provided an answer to the user's query. 
869 \n", - "1 The response provided an answer to the user's query. 2008 \n", - "2 The response provided an answer to the user's query. 943 \n", + " grade_reason \\\n", + "0 The response does not provide any information or context to evaluate the coherence. It lacks the necessary details to assign a score. \n", + "1 The response does not provide any information or context to evaluate the coherence. It lacks necessary details for a proper assessment. \n", + "2 The response does not provide any information or context to evaluate the coherence. It lacks the necessary details to assign a score. \n", "\n", - " model custom_prompt_score \n", - "0 gpt-3.5-turbo 1 \n", - "1 gpt-3.5-turbo 1 \n", - "2 gpt-3.5-turbo 1 " + " runtime model score \n", + "0 1012 gpt-3.5-turbo 1 \n", + "1 1136 gpt-3.5-turbo 1 \n", + "2 1074 gpt-3.5-turbo 1 " ] }, - "execution_count": 7, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -268,11 +265,16 @@ "\n", "User Query: {query}\n", "Response: {response}\"\"\"\n", + "eval_prompt_1 = \"\"\"\n", + "If response answers the query, then pass otherwise fail.\n", + "\n", + "User Query: {query}\n", + "Response: {response}\"\"\"\n", "CustomPrompt(\n", " eval_prompt=eval_prompt, \n", + " output_type='numeric',\n", " model=eval_model, \n", - " output_type=\"numeric\",\n", - " display_name=\"Response should answer user's query\",\n", + " display_name=\"Response should answer user's query coherently\",\n", ").run_batch(data=dataset).to_df()" ] } diff --git a/pyproject.toml b/pyproject.toml index f8fc9b2..4e8e55f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "athina" -version = "1.4.19" +version = "1.4.25" description = "Python SDK to configure and run evaluations for your LLM-based application" authors = ["Shiv Sakhuja ", "Akshat Gupta ", "Vivek Aditya ", "Akhil Bisht "] readme = "README.md"
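
Usage sketch for the feature added by this series (not part of the patches themselves): the new `output_type` parameter of `CustomPrompt`, as exercised in the notebook changes of PATCH 3/4 and 4/4. The dataset construction via `Loader().load_dict(...)` is an assumption inferred from the notebook's imports, since the cell that builds `dataset` is outside these diffs; the prompt, model, and display name are taken from the notebook.

    import os
    from athina.evals import CustomPrompt
    from athina.loaders import Loader
    from athina.keys import OpenAiApiKey

    OpenAiApiKey.set_key(os.getenv("OPENAI_API_KEY"))

    # Records matching the notebook's dataset (query / context / response).
    raw_data = [
        {
            "query": "What is the capital of Greece?",
            "context": ["Greece is often called the cradle of Western civilization."],
            "response": "Athens",
        },
        {
            "query": "What is the price of a Tesla Model 3?",
            "context": ["Tesla Model 3 is a fully electric car."],
            "response": "I cannot answer this question as prices vary from country to country.",
        },
    ]
    # Assumed loader call; the notebook cell that creates `dataset` is not shown in this diff.
    dataset = Loader().load_dict(raw_data)

    # With output_type='numeric' the evaluator emits a "score" metric (MetricType.SCORE)
    # and leaves failure as None; with the default 'boolean' it emits a "passed" metric
    # derived from the Pass/Fail result. The default is 'boolean' as of PATCH 4/4.
    numeric_eval_prompt = """
    Based on the coherence of response, give the score ranging from 1 to 5.

    User Query: {query}
    Response: {response}"""

    CustomPrompt(
        eval_prompt=numeric_eval_prompt,
        output_type="numeric",
        model="gpt-3.5-turbo",
        display_name="Response should answer user's query coherently",
    ).run_batch(data=dataset).to_df()

In the numeric case the returned dataframe carries a `score` column and a None `failed` column, mirroring the notebook output in PATCH 4/4, while the boolean case keeps the original pass/fail behaviour of `CustomPrompt`.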