non boolean custom evals #82

Merged
merged 5 commits on Jul 5, 2024
80 changes: 79 additions & 1 deletion athina/evals/llm/custom_prompt/evaluator.py
@@ -1,3 +1,5 @@
import time
from athina.helpers.logger import logger
from typing import List, Optional, Dict
from jinja2 import Environment
from athina.helpers.jinja_helper import PreserveUndefined
@@ -6,14 +8,16 @@
from ..llm_evaluator import LlmEvaluator
from athina.evals.eval_type import LlmEvalTypeId
from ..example import FewShotExample

from athina.interfaces.result import EvalResult, EvalResultMetric
from athina.metrics.metric_type import MetricType

class CustomPrompt(LlmEvaluator):
    """
    This evaluator can be configured with custom examples and instructions.
    """

    _eval_prompt: Optional[str] = None
    _output_type: Optional[str] = None
    _display_name: str = None
    _metric_ids: List[str] = None
    _model: str = None
@@ -23,6 +27,7 @@ class CustomPrompt(LlmEvaluator):
    def __init__(
        self,
        eval_prompt: str,
        output_type: str = 'boolean',
        display_name: str = None,
        metric_ids: List[str] = None,
        model: str = None,
@@ -37,6 +42,7 @@ def __init__(
            raise ValueError("model is not defined")

        self._eval_prompt = eval_prompt
        self._output_type = output_type
        self._display_name = display_name
        self._metric_ids = metric_ids
        self._model = model
@@ -95,3 +101,75 @@ def is_failure(self, result) -> Optional[bool]:
    def _user_message(self, **kwargs) -> str:
        template = self.env.from_string(self._user_message_template)
        return template.render(**kwargs)

    def _system_message(self) -> str:
        if self._output_type == 'boolean':
            return (
                "### INSTRUCTIONS ###\n"
                "You are an expert at evaluating responses by an AI.\n"
                "Based on the instructions provided, you will evaluate the response and determine if it passes or fails.\n"
                "You MUST return a JSON object with the following fields:\n"
                "- result: Result must be either 'Pass' or 'Fail'.\n"
                "- explanation: An explanation of why the result is Pass or Fail.\n"
            )
        elif self._output_type == 'numeric':
            return (
                "### INSTRUCTIONS ###\n"
                "You are an expert at evaluating responses by an AI.\n"
                "Based on the instructions provided, you will evaluate the response and provide a score.\n"
                "You MUST return a JSON object with the following fields:\n"
                "- score: The score based on the provided grading criteria.\n"
                "- explanation: An explanation of the score.\n"
            )

    def _evaluate(self, **kwargs) -> EvalResult:
        """
        Run the LLM evaluator.
        """
        start_time = time.time()
        # Validate that correct args were passed
        self.validate_args(**kwargs)

        # Construct Prompt
        messages = self._prompt_messages(**kwargs)

        # Run the LLM Completion

        chat_completion_response_json: dict = self.llm_service.json_completion(
            model=self._model,
            messages=messages,
            temperature=self.TEMPERATURE,
        )

        metrics = []
        try:
            if self._output_type == 'boolean':
                result = chat_completion_response_json["result"]
                explanation = chat_completion_response_json["explanation"]
                failure = self.is_failure(result)
                passed_value = 1 - float(failure)
                metrics.append(EvalResultMetric(id=MetricType.PASSED.value, value=passed_value))
            elif self._output_type == 'numeric':
                score = chat_completion_response_json["score"]
                explanation = chat_completion_response_json["explanation"]
                metrics.append(EvalResultMetric(id=MetricType.SCORE.value, value=score))
                failure = None  # Numeric evaluations don't have a pass/fail result

        except Exception as e:
            logger.error(f"Error occurred during eval: {e}")
            raise e

        end_time = time.time()
        eval_runtime_ms = int((end_time - start_time) * 1000)
        llm_eval_result = EvalResult(
            name=self.name,
            display_name=self.display_name,
            data=kwargs,
            failure=failure,
            reason=explanation,
            runtime=eval_runtime_ms,
            model=self._model,
            metrics=metrics,
        )
        return {k: v for k, v in llm_eval_result.items() if v is not None}

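For reference, a minimal usage sketch of the new output_type parameter (not part of the diff). The import paths are inferred from the file locations in this PR and from the athina examples, and the inline dataset is a stand-in for whatever query/response records run_batch is normally given, as in the example notebook changed below; treat these as assumptions rather than code from this PR.

# Usage sketch (not part of the diff): a numeric custom eval.
# Assumptions: import paths mirror the file locations above, and run_batch
# accepts the same list-of-dicts records the example notebook feeds it.
import os
from athina.keys import OpenAiApiKey
from athina.evals.llm.custom_prompt.evaluator import CustomPrompt

OpenAiApiKey.set_key(os.getenv("OPENAI_API_KEY"))

dataset = [
    {
        "query": "What is a shooting star?",
        "context": ["Black holes are stars that have collapsed under their own gravity."],
        "response": "A shooting star is a meteor that burns up in the atmosphere.",
    },
]

coherence_eval = CustomPrompt(
    eval_prompt=(
        "Based on the coherence of response, give the score ranging from 1 to 5.\n\n"
        "User Query: {query}\n"
        "Response: {response}"
    ),
    output_type="numeric",   # new in this PR; defaults to 'boolean'
    model="gpt-3.5-turbo",
    display_name="Response should answer user's query coherently",
)

# The numeric system prompt asks the LLM for {"score": ..., "explanation": ...},
# which _evaluate() converts into a "score" EvalResultMetric; with the default
# 'boolean' output type you get a "passed" metric and a pass/fail failure flag instead.
df = coherence_eval.run_batch(data=dataset).to_df()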
1 change: 1 addition & 0 deletions athina/metrics/metric_type.py
@@ -28,6 +28,7 @@ class MetricType(Enum):
    GROUNDEDNESS = "groundedness"
    PASSED = "passed"
    SIMILARITY_SCORE = "similarity_score"
    SCORE = "score"

    # Conversation Metrics
    CONVERSATION_RESOLUTION = "conversation_resolution"
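The new SCORE member is the metric id used by the numeric branch of CustomPrompt._evaluate above. A one-line sketch of that pairing, mirroring the diff rather than introducing new API:

# Sketch: how the numeric branch wraps the LLM's "score" field.
from athina.interfaces.result import EvalResultMetric
from athina.metrics.metric_type import MetricType

metric = EvalResultMetric(id=MetricType.SCORE.value, value=4)  # id == "score"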
75 changes: 44 additions & 31 deletions examples/run_custom_eval.ipynb
@@ -24,7 +24,7 @@
"import pandas as pd\n",
"\n",
"OpenAiApiKey.set_key(os.getenv('OPENAI_API_KEY'))\n",
"AthinaApiKey.set_key(os.getenv('ATHINA_API_KEY'))"
"# AthinaApiKey.set_key(os.getenv('ATHINA_API_KEY'))"
]
},
{
@@ -136,6 +136,13 @@
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error logging dataset to Athina: ('Connection aborted.', BadStatusLine('ÿ\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x01\\x7ft\\x01/2.32.3\\r\\n'))\n"
]
},
{
"data": {
"text/html": [
@@ -166,7 +173,7 @@
" <th>grade_reason</th>\n",
" <th>runtime</th>\n",
" <th>model</th>\n",
" <th>passed</th>\n",
" <th>score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
@@ -176,38 +183,38 @@
" <td>[Greece is often called the cradle of Western civilization.]</td>\n",
" <td>Athens</td>\n",
" <td>None</td>\n",
" <td>Response should answer user's query</td>\n",
" <td>False</td>\n",
" <td>The response provided the correct answer to the user's query.</td>\n",
" <td>1350</td>\n",
" <td>Response should answer user's query coherently</td>\n",
" <td>None</td>\n",
" <td>The response does not provide any information or context to evaluate the coherence. It lacks the necessary details to assign a score.</td>\n",
" <td>1012</td>\n",
" <td>gpt-3.5-turbo</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>What is the price of a Tesla Model 3?</td>\n",
" <td>[Tesla Model 3 is a fully electric car.]</td>\n",
" <td>I cannot answer this question as prices vary from country to country.</td>\n",
" <td>None</td>\n",
" <td>Response should answer user's query</td>\n",
" <td>True</td>\n",
" <td>The response refuses to answer the user's query, which does not meet the criteria for a pass.</td>\n",
" <td>1161</td>\n",
" <td>Response should answer user's query coherently</td>\n",
" <td>None</td>\n",
" <td>The response does not provide any information or context to evaluate the coherence. It lacks necessary details for a proper assessment.</td>\n",
" <td>1136</td>\n",
" <td>gpt-3.5-turbo</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>What is a shooting star?</td>\n",
" <td>[Black holes are stars that have collapsed under their own gravity. They are so dense that nothing can escape their gravitational pull, not even light.]</td>\n",
" <td>A shooting star is a meteor that burns up in the atmosphere.</td>\n",
" <td>None</td>\n",
" <td>Response should answer user's query</td>\n",
" <td>False</td>\n",
" <td>The response provides a clear answer to the user's query, explaining that a shooting star is a meteor that burns up in the atmosphere.</td>\n",
" <td>1232</td>\n",
" <td>Response should answer user's query coherently</td>\n",
" <td>None</td>\n",
" <td>The response does not provide any information or context to evaluate the coherence. It lacks the necessary details to assign a score.</td>\n",
" <td>1074</td>\n",
" <td>gpt-3.5-turbo</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
@@ -229,20 +236,20 @@
"1 I cannot answer this question as prices vary from country to country. \n",
"2 A shooting star is a meteor that burns up in the atmosphere. \n",
"\n",
" expected_response display_name failed \\\n",
"0 None Response should answer user's query False \n",
"1 None Response should answer user's query True \n",
"2 None Response should answer user's query False \n",
" expected_response display_name failed \\\n",
"0 None Response should answer user's query coherently None \n",
"1 None Response should answer user's query coherently None \n",
"2 None Response should answer user's query coherently None \n",
"\n",
" grade_reason \\\n",
"0 The response provided the correct answer to the user's query. \n",
"1 The response refuses to answer the user's query, which does not meet the criteria for a pass. \n",
"2 The response provides a clear answer to the user's query, explaining that a shooting star is a meteor that burns up in the atmosphere. \n",
" grade_reason \\\n",
"0 The response does not provide any information or context to evaluate the coherence. It lacks the necessary details to assign a score. \n",
"1 The response does not provide any information or context to evaluate the coherence. It lacks necessary details for a proper assessment. \n",
"2 The response does not provide any information or context to evaluate the coherence. It lacks the necessary details to assign a score. \n",
"\n",
" runtime model passed \n",
"0 1350 gpt-3.5-turbo 1.0 \n",
"1 1161 gpt-3.5-turbo 0.0 \n",
"2 1232 gpt-3.5-turbo 1.0 "
" runtime model score \n",
"0 1012 gpt-3.5-turbo 1 \n",
"1 1136 gpt-3.5-turbo 1 \n",
"2 1074 gpt-3.5-turbo 1 "
]
},
"execution_count": 3,
@@ -254,14 +261,20 @@
"# Checks if the LLM response answers the user query sufficiently\n",
"eval_model = \"gpt-3.5-turbo\"\n",
"eval_prompt = \"\"\"\n",
"If the response refuses to answer the user's query, then fail. Otherwise pass.\n",
"Based on the coherence of response, give the score ranging from 1 to 5.\n",
"\n",
"User Query: {query}\n",
"Response: {response}\"\"\"\n",
"eval_prompt_1 = \"\"\"\n",
"If response answers the query, then pass otherwise fail.\n",
"\n",
"User Query: {query}\n",
"Response: {response}\"\"\"\n",
"CustomPrompt(\n",
" eval_prompt=eval_prompt, \n",
" output_type='numeric',\n",
" model=eval_model, \n",
" display_name=\"Response should answer user's query\",\n",
" display_name=\"Response should answer user's query coherently\",\n",
").run_batch(data=dataset).to_df()"
]
}
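The updated notebook defines eval_prompt_1 (a pass/fail prompt) but only runs the numeric evaluator. A sketch, reusing the notebook's own eval_prompt_1, eval_model, and dataset variables, of how the boolean default could be exercised alongside it; the resulting dataframe would then carry a passed column instead of score:

# Sketch (not in the notebook): run the unused pass/fail prompt with the
# default boolean output type.
CustomPrompt(
    eval_prompt=eval_prompt_1,   # defined but unused in the notebook cell above
    output_type='boolean',       # the default; spelled out for clarity
    model=eval_model,
    display_name="Response should answer user's query",
).run_batch(data=dataset).to_df()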
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "athina"
version = "1.4.24"
version = "1.4.25"
description = "Python SDK to configure and run evaluations for your LLM-based application"
authors = ["Shiv Sakhuja <[email protected]>", "Akshat Gupta <[email protected]>", "Vivek Aditya <[email protected]>", "Akhil Bisht <[email protected]>"]
readme = "README.md"