diff --git a/athina/evals/llm/custom_prompt/evaluator.py b/athina/evals/llm/custom_prompt/evaluator.py
index 8efd95b..f628fb0 100644
--- a/athina/evals/llm/custom_prompt/evaluator.py
+++ b/athina/evals/llm/custom_prompt/evaluator.py
@@ -1,3 +1,5 @@
+import time
+from athina.helpers.logger import logger
 from typing import List, Optional, Dict
 from jinja2 import Environment
 from athina.helpers.jinja_helper import PreserveUndefined
@@ -6,7 +8,8 @@
 from ..llm_evaluator import LlmEvaluator
 from athina.evals.eval_type import LlmEvalTypeId
 from ..example import FewShotExample
-
+from athina.interfaces.result import EvalResult, EvalResultMetric
+from athina.metrics.metric_type import MetricType
 
 class CustomPrompt(LlmEvaluator):
     """
@@ -14,6 +17,7 @@ class CustomPrompt(LlmEvaluator):
     """
 
     _eval_prompt: Optional[str] = None
+    _output_type: Optional[str] = None
     _display_name: str = None
     _metric_ids: List[str] = None
     _model: str = None
@@ -23,6 +27,7 @@ class CustomPrompt(LlmEvaluator):
     def __init__(
         self,
         eval_prompt: str,
+        output_type: str = 'boolean',
         display_name: str = None,
         metric_ids: List[str] = None,
         model: str = None,
@@ -37,6 +42,7 @@ def __init__(
             raise ValueError("model is not defined")
 
         self._eval_prompt = eval_prompt
+        self._output_type = output_type
         self._display_name = display_name
         self._metric_ids = metric_ids
         self._model = model
@@ -95,3 +101,75 @@ def is_failure(self, result) -> Optional[bool]:
     def _user_message(self, **kwargs) -> str:
         template = self.env.from_string(self._user_message_template)
         return template.render(**kwargs)
+
+    def _system_message(self) -> str:
+        if self._output_type == 'boolean':
+            return (
+                "### INSTRUCTIONS ###\n"
+                "You are an expert at evaluating responses by an AI.\n"
+                "Based on the instructions provided, you will evaluate the response and determine if it passes or fails.\n"
+                "You MUST return a JSON object with the following fields:\n"
+                "- result: Result must be either 'Pass' or 'Fail'.\n"
+                "- explanation: An explanation of why the result is Pass or Fail.\n"
+            )
+        elif self._output_type == 'numeric':
+            return (
+                "### INSTRUCTIONS ###\n"
+                "You are an expert at evaluating responses by an AI.\n"
+                "Based on the instructions provided, you will evaluate the response and provide a score.\n"
+                "You MUST return a JSON object with the following fields:\n"
+                "- score: The score based on the provided grading criteria.\n"
+                "- explanation: An explanation of the score.\n"
+            )
+
+    def _evaluate(self, **kwargs) -> EvalResult:
+        """
+        Run the LLM evaluator.
+ """ + start_time = time.time() + # Validate that correct args were passed + self.validate_args(**kwargs) + + # Construct Prompt + messages = self._prompt_messages(**kwargs) + + # Run the LLM Completion + + chat_completion_response_json: dict = self.llm_service.json_completion( + model=self._model, + messages=messages, + temperature=self.TEMPERATURE, + ) + + metrics = [] + try: + if self._output_type == 'boolean': + result = chat_completion_response_json["result"] + explanation = chat_completion_response_json["explanation"] + failure = self.is_failure(result) + passed_value = 1 - float(failure) + metrics.append(EvalResultMetric(id=MetricType.PASSED.value, value=passed_value)) + elif self._output_type == 'numeric': + score = chat_completion_response_json["score"] + explanation = chat_completion_response_json["explanation"] + metrics.append(EvalResultMetric(id=MetricType.SCORE.value, value=score)) + failure = None # Numeric evaluations don't have a pass/fail result + + except Exception as e: + logger.error(f"Error occurred during eval: {e}") + raise e + + end_time = time.time() + eval_runtime_ms = int((end_time - start_time) * 1000) + llm_eval_result = EvalResult( + name=self.name, + display_name=self.display_name, + data=kwargs, + failure=failure, + reason=explanation, + runtime=eval_runtime_ms, + model=self._model, + metrics=metrics, + ) + return {k: v for k, v in llm_eval_result.items() if v is not None} + diff --git a/athina/metrics/metric_type.py b/athina/metrics/metric_type.py index 2a5c026..77106f8 100644 --- a/athina/metrics/metric_type.py +++ b/athina/metrics/metric_type.py @@ -28,6 +28,7 @@ class MetricType(Enum): GROUNDEDNESS = "groundedness" PASSED = "passed" SIMILARITY_SCORE = "similarity_score" + SCORE = "score" # Conversation Metrics CONVERSATION_RESOLUTION = "conversation_resolution" diff --git a/examples/run_custom_eval.ipynb b/examples/run_custom_eval.ipynb index b30296e..fab47b7 100644 --- a/examples/run_custom_eval.ipynb +++ b/examples/run_custom_eval.ipynb @@ -24,7 +24,7 @@ "import pandas as pd\n", "\n", "OpenAiApiKey.set_key(os.getenv('OPENAI_API_KEY'))\n", - "AthinaApiKey.set_key(os.getenv('ATHINA_API_KEY'))" + "# AthinaApiKey.set_key(os.getenv('ATHINA_API_KEY'))" ] }, { @@ -136,6 +136,13 @@ "execution_count": 3, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Error logging dataset to Athina: ('Connection aborted.', BadStatusLine('ΓΏ\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x01\\x7ft\\x01/2.32.3\\r\\n'))\n" + ] + }, { "data": { "text/html": [ @@ -166,7 +173,7 @@ " grade_reason\n", " runtime\n", " model\n", - " passed\n", + " score\n", " \n", " \n", " \n", @@ -176,12 +183,12 @@ " [Greece is often called the cradle of Western civilization.]\n", " Athens\n", " None\n", - " Response should answer user's query\n", - " False\n", - " The response provided the correct answer to the user's query.\n", - " 1350\n", + " Response should answer user's query coherently\n", + " None\n", + " The response does not provide any information or context to evaluate the coherence. 
+      "      1012\n",
       "      gpt-3.5-turbo\n",
-      "      1.0\n",
+      "      1\n",
       "    \n",
       "    \n",
       "      1\n",
@@ -189,12 +196,12 @@
       "      [Tesla Model 3 is a fully electric car.]\n",
       "      I cannot answer this question as prices vary from country to country.\n",
       "      None\n",
-      "      Response should answer user's query\n",
-      "      True\n",
-      "      The response refuses to answer the user's query, which does not meet the criteria for a pass.\n",
-      "      1161\n",
+      "      Response should answer user's query coherently\n",
+      "      None\n",
+      "      The response does not provide any information or context to evaluate the coherence. It lacks necessary details for a proper assessment.\n",
+      "      1136\n",
       "      gpt-3.5-turbo\n",
-      "      0.0\n",
+      "      1\n",
       "    \n",
       "    \n",
       "      2\n",
@@ -202,12 +209,12 @@
       "      [Black holes are stars that have collapsed under their own gravity. They are so dense that nothing can escape their gravitational pull, not even light.]\n",
       "      A shooting star is a meteor that burns up in the atmosphere.\n",
       "      None\n",
-      "      Response should answer user's query\n",
-      "      False\n",
-      "      The response provides a clear answer to the user's query, explaining that a shooting star is a meteor that burns up in the atmosphere.\n",
-      "      1232\n",
+      "      Response should answer user's query coherently\n",
+      "      None\n",
+      "      The response does not provide any information or context to evaluate the coherence. It lacks the necessary details to assign a score.\n",
+      "      1074\n",
       "      gpt-3.5-turbo\n",
-      "      1.0\n",
+      "      1\n",
       "    \n",
       "    \n",
       "\n",
@@ -229,20 +236,20 @@
       "1  I cannot answer this question as prices vary from country to country.  \n",
       "2  A shooting star is a meteor that burns up in the atmosphere.  \n",
       "\n",
-      "  expected_response                         display_name  failed  \\\n",
-      "0               None  Response should answer user's query   False   \n",
-      "1               None  Response should answer user's query    True   \n",
-      "2               None  Response should answer user's query   False   \n",
+      "  expected_response                                    display_name  failed  \\\n",
+      "0               None  Response should answer user's query coherently    None   \n",
+      "1               None  Response should answer user's query coherently    None   \n",
+      "2               None  Response should answer user's query coherently    None   \n",
       "\n",
-      "                                                              grade_reason  \\\n",
-      "0   The response provided the correct answer to the user's query.   \n",
-      "1   The response refuses to answer the user's query, which does not meet the criteria for a pass.   \n",
-      "2   The response provides a clear answer to the user's query, explaining that a shooting star is a meteor that burns up in the atmosphere.   \n",
+      "                                                              grade_reason  \\\n",
+      "0   The response does not provide any information or context to evaluate the coherence. It lacks the necessary details to assign a score.   \n",
+      "1   The response does not provide any information or context to evaluate the coherence. It lacks necessary details for a proper assessment.   \n",
+      "2   The response does not provide any information or context to evaluate the coherence. It lacks the necessary details to assign a score.   \n",
       "\n",
-      "   runtime          model  passed  \n",
-      "0     1350  gpt-3.5-turbo     1.0  \n",
-      "1     1161  gpt-3.5-turbo     0.0  \n",
-      "2     1232  gpt-3.5-turbo     1.0  "
+      "   runtime          model  score  \n",
+      "0     1012  gpt-3.5-turbo      1  \n",
+      "1     1136  gpt-3.5-turbo      1  \n",
+      "2     1074  gpt-3.5-turbo      1  "
      ]
     },
     "execution_count": 3,
@@ -254,14 +261,20 @@
    "# Checks if the LLM response answers the user query sufficiently\n",
    "eval_model = \"gpt-3.5-turbo\"\n",
    "eval_prompt = \"\"\"\n",
-    "If the response refuses to answer the user's query, then fail. Otherwise pass.\n",
+    "Based on the coherence of response, give the score ranging from 1 to 5.\n",
+    "\n",
+    "User Query: {query}\n",
+    "Response: {response}\"\"\"\n",
+    "eval_prompt_1 = \"\"\"\n",
+    "If response answers the query, then pass otherwise fail.\n",
     "\n",
     "User Query: {query}\n",
     "Response: {response}\"\"\"\n",
     "CustomPrompt(\n",
     "    eval_prompt=eval_prompt, \n",
+    "    output_type='numeric',\n",
     "    model=eval_model, \n",
-    "    display_name=\"Response should answer user's query\",\n",
+    "    display_name=\"Response should answer user's query coherently\",\n",
     ").run_batch(data=dataset).to_df()"
    ]
   }
diff --git a/pyproject.toml b/pyproject.toml
index 1931573..4e8e55f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "athina"
-version = "1.4.24"
+version = "1.4.25"
 description = "Python SDK to configure and run evaluations for your LLM-based application"
 authors = ["Shiv Sakhuja ", "Akshat Gupta ", "Vivek Aditya ", "Akhil Bisht "]
 readme = "README.md"
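Usage sketch (not part of the patch): the snippet below illustrates how the new output_type parameter introduced in this diff could be called in numeric mode. The import paths and the dataset shape (a list of dicts keyed by the prompt's template variables) are assumptions based on the example notebook above, not something this patch defines.

import os
from athina.evals import CustomPrompt
from athina.keys import OpenAiApiKey

OpenAiApiKey.set_key(os.getenv("OPENAI_API_KEY"))

# Assumed dataset shape: one dict per datapoint, keyed by the template
# variables used in eval_prompt ({query} and {response} here).
dataset = [
    {
        "query": "What is the capital of Greece?",
        "response": "Athens",
    },
]

# With output_type='numeric', the evaluator asks the model for a JSON object
# containing 'score' and 'explanation', and reports a 'score' metric instead
# of the boolean pass/fail result.
eval_prompt = """
Based on the coherence of the response, give a score ranging from 1 to 5.

User Query: {query}
Response: {response}"""

df = (
    CustomPrompt(
        eval_prompt=eval_prompt,
        output_type="numeric",  # new parameter; defaults to 'boolean'
        model="gpt-3.5-turbo",
        display_name="Response should answer user's query coherently",
    )
    .run_batch(data=dataset)
    .to_df()
)
print(df)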