Merge pull request #82 from athina-ai/feature/non-boolean-custom-evals
non boolean custom evals
akshat-g authored Jul 5, 2024
2 parents 2d4c45d + 3ecdf25 commit 10780b1
Showing 4 changed files with 125 additions and 33 deletions.
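For quick reference, below is a minimal usage sketch of the output_type parameter this PR introduces, mirroring the example notebook changed in this diff. The import path is assumed from the file layout shown here (the package may also re-export CustomPrompt elsewhere), and dataset is a placeholder for data loaded as in examples/run_custom_eval.ipynb.

    # Sketch only: import path assumed from athina/evals/llm/custom_prompt/evaluator.py
    from athina.evals.llm.custom_prompt.evaluator import CustomPrompt

    eval_model = "gpt-3.5-turbo"
    # Grading prompt that asks for a 1-5 score instead of a Pass/Fail verdict.
    eval_prompt = """
    Based on the coherence of the response, give a score ranging from 1 to 5.

    User Query: {query}
    Response: {response}"""

    CustomPrompt(
        eval_prompt=eval_prompt,
        output_type='numeric',   # new in this PR; defaults to 'boolean'
        model=eval_model,
        display_name="Response should answer user's query coherently",
    ).run_batch(data=dataset).to_df()  # dataset: hypothetical, loaded as in the notebook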
80 changes: 79 additions & 1 deletion athina/evals/llm/custom_prompt/evaluator.py
@@ -1,3 +1,5 @@
import time
from athina.helpers.logger import logger
from typing import List, Optional, Dict
from jinja2 import Environment
from athina.helpers.jinja_helper import PreserveUndefined
@@ -6,14 +8,16 @@
from ..llm_evaluator import LlmEvaluator
from athina.evals.eval_type import LlmEvalTypeId
from ..example import FewShotExample

from athina.interfaces.result import EvalResult, EvalResultMetric
from athina.metrics.metric_type import MetricType

class CustomPrompt(LlmEvaluator):
"""
This evaluator can be configured with custom examples and instructions.
"""

_eval_prompt: Optional[str] = None
_output_type: Optional[str] = None
_display_name: str = None
_metric_ids: List[str] = None
_model: str = None
@@ -23,6 +27,7 @@ class CustomPrompt(LlmEvaluator):
def __init__(
self,
eval_prompt: str,
output_type: str = 'boolean',
display_name: str = None,
metric_ids: List[str] = None,
model: str = None,
@@ -37,6 +42,7 @@ def __init__(
raise ValueError("model is not defined")

self._eval_prompt = eval_prompt
self._output_type = output_type
self._display_name = display_name
self._metric_ids = metric_ids
self._model = model
@@ -95,3 +101,75 @@ def is_failure(self, result) -> Optional[bool]:
def _user_message(self, **kwargs) -> str:
template = self.env.from_string(self._user_message_template)
return template.render(**kwargs)

def _system_message(self) -> str:
if self._output_type == 'boolean':
return (
"### INSTRUCTIONS ###\n"
"You are an expert at evaluating responses by an AI.\n"
"Based on the instructions provided, you will evaluate the response and determine if it passes or fails.\n"
"You MUST return a JSON object with the following fields:\n"
"- result: Result must be either 'Pass' or 'Fail'.\n"
"- explanation: An explanation of why the result is Pass or Fail.\n"
)
elif self._output_type == 'numeric':
return (
"### INSTRUCTIONS ###\n"
"You are an expert at evaluating responses by an AI.\n"
"Based on the instructions provided, you will evaluate the response and provide a score.\n"
"You MUST return a JSON object with the following fields:\n"
"- score: The score based on the provided grading criteria.\n"
"- explanation: An explanation of the score.\n"
)

def _evaluate(self, **kwargs) -> EvalResult:
"""
Run the LLM evaluator.
"""
start_time = time.time()
# Validate that correct args were passed
self.validate_args(**kwargs)

# Construct Prompt
messages = self._prompt_messages(**kwargs)

# Run the LLM Completion

chat_completion_response_json: dict = self.llm_service.json_completion(
model=self._model,
messages=messages,
temperature=self.TEMPERATURE,
)

metrics = []
try:
if self._output_type == 'boolean':
result = chat_completion_response_json["result"]
explanation = chat_completion_response_json["explanation"]
failure = self.is_failure(result)
passed_value = 1 - float(failure)
metrics.append(EvalResultMetric(id=MetricType.PASSED.value, value=passed_value))
elif self._output_type == 'numeric':
score = chat_completion_response_json["score"]
explanation = chat_completion_response_json["explanation"]
metrics.append(EvalResultMetric(id=MetricType.SCORE.value, value=score))
failure = None # Numeric evaluations don't have a pass/fail result

except Exception as e:
logger.error(f"Error occurred during eval: {e}")
raise e

end_time = time.time()
eval_runtime_ms = int((end_time - start_time) * 1000)
llm_eval_result = EvalResult(
name=self.name,
display_name=self.display_name,
data=kwargs,
failure=failure,
reason=explanation,
runtime=eval_runtime_ms,
model=self._model,
metrics=metrics,
)
return {k: v for k, v in llm_eval_result.items() if v is not None}

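As an illustrative sketch (values made up), this is the JSON shape the numeric branch of _evaluate requests from llm_service.json_completion and the metric entry it builds from it; the imports follow the paths shown in this diff.

    from athina.interfaces.result import EvalResultMetric
    from athina.metrics.metric_type import MetricType

    # Example of the JSON object the numeric system message asks the model to return.
    chat_completion_response_json = {
        "score": 4,
        "explanation": "The response is mostly coherent.",
    }

    # _evaluate records the score under the new SCORE metric type ("score").
    metric = EvalResultMetric(
        id=MetricType.SCORE.value,
        value=chat_completion_response_json["score"],
    )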
1 change: 1 addition & 0 deletions athina/metrics/metric_type.py
@@ -28,6 +28,7 @@ class MetricType(Enum):
GROUNDEDNESS = "groundedness"
PASSED = "passed"
SIMILARITY_SCORE = "similarity_score"
SCORE = "score"

# Conversation Metrics
CONVERSATION_RESOLUTION = "conversation_resolution"
75 changes: 44 additions & 31 deletions examples/run_custom_eval.ipynb
@@ -24,7 +24,7 @@
"import pandas as pd\n",
"\n",
"OpenAiApiKey.set_key(os.getenv('OPENAI_API_KEY'))\n",
"AthinaApiKey.set_key(os.getenv('ATHINA_API_KEY'))"
"# AthinaApiKey.set_key(os.getenv('ATHINA_API_KEY'))"
]
},
{
@@ -136,6 +136,13 @@
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error logging dataset to Athina: ('Connection aborted.', BadStatusLine('ÿ\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x01\\x7ft\\x01/2.32.3\\r\\n'))\n"
]
},
{
"data": {
"text/html": [
@@ -166,7 +173,7 @@
" <th>grade_reason</th>\n",
" <th>runtime</th>\n",
" <th>model</th>\n",
" <th>passed</th>\n",
" <th>score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
@@ -176,38 +183,38 @@
" <td>[Greece is often called the cradle of Western civilization.]</td>\n",
" <td>Athens</td>\n",
" <td>None</td>\n",
" <td>Response should answer user's query</td>\n",
" <td>False</td>\n",
" <td>The response provided the correct answer to the user's query.</td>\n",
" <td>1350</td>\n",
" <td>Response should answer user's query coherently</td>\n",
" <td>None</td>\n",
" <td>The response does not provide any information or context to evaluate the coherence. It lacks the necessary details to assign a score.</td>\n",
" <td>1012</td>\n",
" <td>gpt-3.5-turbo</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>What is the price of a Tesla Model 3?</td>\n",
" <td>[Tesla Model 3 is a fully electric car.]</td>\n",
" <td>I cannot answer this question as prices vary from country to country.</td>\n",
" <td>None</td>\n",
" <td>Response should answer user's query</td>\n",
" <td>True</td>\n",
" <td>The response refuses to answer the user's query, which does not meet the criteria for a pass.</td>\n",
" <td>1161</td>\n",
" <td>Response should answer user's query coherently</td>\n",
" <td>None</td>\n",
" <td>The response does not provide any information or context to evaluate the coherence. It lacks necessary details for a proper assessment.</td>\n",
" <td>1136</td>\n",
" <td>gpt-3.5-turbo</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>What is a shooting star?</td>\n",
" <td>[Black holes are stars that have collapsed under their own gravity. They are so dense that nothing can escape their gravitational pull, not even light.]</td>\n",
" <td>A shooting star is a meteor that burns up in the atmosphere.</td>\n",
" <td>None</td>\n",
" <td>Response should answer user's query</td>\n",
" <td>False</td>\n",
" <td>The response provides a clear answer to the user's query, explaining that a shooting star is a meteor that burns up in the atmosphere.</td>\n",
" <td>1232</td>\n",
" <td>Response should answer user's query coherently</td>\n",
" <td>None</td>\n",
" <td>The response does not provide any information or context to evaluate the coherence. It lacks the necessary details to assign a score.</td>\n",
" <td>1074</td>\n",
" <td>gpt-3.5-turbo</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
@@ -229,20 +236,20 @@
"1 I cannot answer this question as prices vary from country to country. \n",
"2 A shooting star is a meteor that burns up in the atmosphere. \n",
"\n",
" expected_response display_name failed \\\n",
"0 None Response should answer user's query False \n",
"1 None Response should answer user's query True \n",
"2 None Response should answer user's query False \n",
" expected_response display_name failed \\\n",
"0 None Response should answer user's query coherently None \n",
"1 None Response should answer user's query coherently None \n",
"2 None Response should answer user's query coherently None \n",
"\n",
" grade_reason \\\n",
"0 The response provided the correct answer to the user's query. \n",
"1 The response refuses to answer the user's query, which does not meet the criteria for a pass. \n",
"2 The response provides a clear answer to the user's query, explaining that a shooting star is a meteor that burns up in the atmosphere. \n",
" grade_reason \\\n",
"0 The response does not provide any information or context to evaluate the coherence. It lacks the necessary details to assign a score. \n",
"1 The response does not provide any information or context to evaluate the coherence. It lacks necessary details for a proper assessment. \n",
"2 The response does not provide any information or context to evaluate the coherence. It lacks the necessary details to assign a score. \n",
"\n",
" runtime model passed \n",
"0 1350 gpt-3.5-turbo 1.0 \n",
"1 1161 gpt-3.5-turbo 0.0 \n",
"2 1232 gpt-3.5-turbo 1.0 "
" runtime model score \n",
"0 1012 gpt-3.5-turbo 1 \n",
"1 1136 gpt-3.5-turbo 1 \n",
"2 1074 gpt-3.5-turbo 1 "
]
},
"execution_count": 3,
@@ -254,14 +261,20 @@
"# Checks if the LLM response answers the user query sufficiently\n",
"eval_model = \"gpt-3.5-turbo\"\n",
"eval_prompt = \"\"\"\n",
"If the response refuses to answer the user's query, then fail. Otherwise pass.\n",
"Based on the coherence of response, give the score ranging from 1 to 5.\n",
"\n",
"User Query: {query}\n",
"Response: {response}\"\"\"\n",
"eval_prompt_1 = \"\"\"\n",
"If response answers the query, then pass otherwise fail.\n",
"\n",
"User Query: {query}\n",
"Response: {response}\"\"\"\n",
"CustomPrompt(\n",
" eval_prompt=eval_prompt, \n",
" output_type='numeric',\n",
" model=eval_model, \n",
" display_name=\"Response should answer user's query\",\n",
" display_name=\"Response should answer user's query coherently\",\n",
").run_batch(data=dataset).to_df()"
]
}
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "athina"
version = "1.4.24"
version = "1.4.25"
description = "Python SDK to configure and run evaluations for your LLM-based application"
authors = ["Shiv Sakhuja <[email protected]>", "Akshat Gupta <[email protected]>", "Vivek Aditya <[email protected]>", "Akhil Bisht <[email protected]>"]
readme = "README.md"
