
non boolean custom evals #82

Merged · 5 commits · Jul 5, 2024
Changes from 3 commits
86 changes: 85 additions & 1 deletion athina/evals/llm/custom_prompt/evaluator.py
@@ -1,3 +1,5 @@
import time
from athina.helpers.logger import logger
from typing import List, Optional, Dict
from jinja2 import Environment
from athina.helpers.jinja_helper import PreserveUndefined
@@ -6,14 +8,16 @@
from ..llm_evaluator import LlmEvaluator
from athina.evals.eval_type import LlmEvalTypeId
from ..example import FewShotExample

from athina.interfaces.result import EvalResult, EvalResultMetric
from athina.metrics.metric_type import MetricType

class CustomPrompt(LlmEvaluator):
    """
    This evaluator can be configured with custom examples and instructions.
    """

    _eval_prompt: Optional[str] = None
    _output_type: Optional[str] = None
    _display_name: str = None
    _metric_ids: List[str] = None
    _model: str = None
@@ -23,6 +27,7 @@ class CustomPrompt(LlmEvaluator):
    def __init__(
        self,
        eval_prompt: str,
        output_type: str = None,
        display_name: str = None,
        metric_ids: List[str] = None,
        model: str = None,
@@ -37,6 +42,7 @@ def __init__(
            raise ValueError("model is not defined")

        self._eval_prompt = eval_prompt
        self._output_type = output_type
        self._display_name = display_name
        self._metric_ids = metric_ids
        self._model = model
@@ -95,3 +101,81 @@ def is_failure(self, result) -> Optional[bool]:
    def _user_message(self, **kwargs) -> str:
        template = self.env.from_string(self._user_message_template)
        return template.render(**kwargs)

    def _system_message(self) -> str:
        if self._output_type == 'boolean':
            return (
                "### INSTRUCTIONS ###\n"
                "You are an expert at evaluating responses by an AI.\n"
                "Based on the instructions provided, you will evaluate the response and determine if it passes or fails.\n"
                "You MUST return a JSON object with the following fields:\n"
                "- result: Result must be either 'Pass' or 'Fail'.\n"
                "- explanation: An explanation of why the result is Pass or Fail.\n"
            )
        elif self._output_type == 'numeric':
            return (
                "### INSTRUCTIONS ###\n"
                "You are an expert at evaluating responses by an AI.\n"
                "Based on the instructions provided, you will evaluate the response and provide a score.\n"
                "You MUST return a JSON object with the following fields:\n"
                "- score: The score based on the provided grading criteria.\n"
                "- explanation: An explanation of the score.\n"
            )
        else:
            return super()._system_message()

    def _evaluate(self, **kwargs) -> EvalResult:
        """
        Run the LLM evaluator.
        """
        start_time = time.time()
        # Validate that correct args were passed
        self.validate_args(**kwargs)

        # Construct Prompt
        messages = self._prompt_messages(**kwargs)

        # Run the LLM Completion
        chat_completion_response_json: dict = self.llm_service.json_completion(
            model=self._model,
            messages=messages,
            temperature=self.TEMPERATURE,
        )

        metrics = []
        try:
            if self._output_type == 'boolean':
                result = chat_completion_response_json["result"]
                explanation = chat_completion_response_json["explanation"]
                print(f"result: {result}")
                print(f"explanation: {explanation}")
                failure = self.is_failure(result)
                passed_value = 1 - float(failure)
                metrics.append(EvalResultMetric(id=MetricType.PASSED.value, value=passed_value))
            elif self._output_type == 'numeric':
                score = chat_completion_response_json["score"]
                explanation = chat_completion_response_json["explanation"]
                print(f"score: {score}")
                print(f"explanation: {explanation}")
                metrics.append(EvalResultMetric(id=MetricType.SCORE.value, value=score))
                failure = None  # Numeric evaluations don't have a pass/fail result

        except Exception as e:
            logger.error(f"Error occurred during eval: {e}")
            raise e

        end_time = time.time()
        eval_runtime_ms = int((end_time - start_time) * 1000)
        llm_eval_result = EvalResult(
            name=self.name,
            display_name=self.display_name,
            data=kwargs,
            failure=failure,
            reason=explanation,
            runtime=eval_runtime_ms,
            model=self._model,
            metrics=metrics,
        )
        return {k: v for k, v in llm_eval_result.items() if v is not None}
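
For reviewers who want to try the new `output_type` branch locally, here is a minimal sketch. The constructor arguments and the `_evaluate(**kwargs)` signature come from the diff above; the model name, the prompt text, the keyword arguments passed to `_evaluate`, and calling `_evaluate` directly (rather than whatever public run method the SDK exposes) are illustrative assumptions, not part of this PR.

```python
from athina.evals.llm.custom_prompt.evaluator import CustomPrompt

# Numeric mode (new in this PR): the system prompt asks the LLM for a JSON
# object with "score" and "explanation", and _evaluate records the score
# under MetricType.SCORE with failure=None.
numeric_eval = CustomPrompt(
    eval_prompt="Rate the helpfulness of this response from 1 to 5: {{response}}",
    output_type="numeric",
    display_name="Helpfulness (1-5)",
    model="gpt-4o",  # assumed model name, for illustration only
)

# Boolean mode keeps the existing Pass/Fail contract and the "passed" metric
# (1.0 for Pass, 0.0 for Fail).
boolean_eval = CustomPrompt(
    eval_prompt="Does the response answer the question? Question: {{query}} Response: {{response}}",
    output_type="boolean",
    display_name="Answers the question",
    model="gpt-4o",
)

# Hypothetical invocation; kwargs are rendered into the Jinja user-message template.
result = numeric_eval._evaluate(response="Paris is the capital of France.")
print(result)
```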

1 change: 1 addition & 0 deletions athina/metrics/metric_type.py
@@ -28,6 +28,7 @@ class MetricType(Enum):
    GROUNDEDNESS = "groundedness"
    PASSED = "passed"
    SIMILARITY_SCORE = "similarity_score"
    SCORE = "score"

    # Conversation Metrics
    CONVERSATION_RESOLUTION = "conversation_resolution"
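
The new enum member pairs with the `EvalResultMetric` usage in the evaluator diff above. A small sketch of that pairing follows; the constructor call mirrors the one in the diff, while the example values are made up for illustration.

```python
from athina.interfaces.result import EvalResultMetric
from athina.metrics.metric_type import MetricType

# Numeric custom evals report their grade under the new "score" metric id,
# alongside the existing "passed" id used by boolean evals.
score_metric = EvalResultMetric(id=MetricType.SCORE.value, value=4.0)
passed_metric = EvalResultMetric(id=MetricType.PASSED.value, value=1.0)

print(score_metric)
print(passed_metric)
```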