
Commit

Merge pull request #242 from parea-ai/PAI-402-add-example-evaluation-functions

Pai 402 add example evaluation functions
joschkabraun committed Nov 27, 2023
2 parents dac7a72 + 9e6e8c4 commit bd9b5ab
Showing 11 changed files with 863 additions and 35 deletions.
Empty file added parea/evals/__init__.py
39 changes: 39 additions & 0 deletions parea/evals/chat.py
@@ -0,0 +1,39 @@
from parea.evals.utils import call_openai
from parea.schemas.models import Log


def goal_success_ratio(log: Log) -> float:
    """Returns the average goal-success score across conversation segments: each segment scores
    2 divided by its number of messages, so fewer turns to reach a goal yields a higher score."""
    messages = [m.to_dict() for m in log.configuration.messages]

    # determine where a new goal starts
    conversation_segments = []
    start_index = 0
    end_index = 3
    while end_index < len(messages):
        user_follows_same_goal = call_openai(
            [
                {
                    "role": "system",
                    "content": "Look at the conversation and determine if the user is still following the same goal "
                    "or if they are following a new goal. If they are following the same goal, respond "
                    "SAME_GOAL. Otherwise, respond NEW_GOAL. In any case do not answer the user request!",
                }
            ]
            + messages[start_index:end_index],
            model="gpt-4",
        )

        if user_follows_same_goal == "SAME_GOAL":
            end_index += 2
        else:
            conversation_segments.append(messages[start_index : end_index - 1])
            start_index = end_index - 1
            end_index += 2

    if start_index < len(messages):
        conversation_segments.append(messages[start_index:])

    # for now, assume the user reached their goal in every segment and
    # return the average per-segment score (2 divided by the segment's message count)
    return sum([2 / len(segment) for segment in conversation_segments]) / len(conversation_segments)
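For intuition, each conversation segment of n messages contributes a score of 2/n (so a goal reached in a single user/assistant exchange scores 1.0), and the per-segment scores are averaged. A minimal sketch of just that scoring step, with hypothetical hard-coded segments instead of the LLM-driven segmentation above:

```python
# Minimal sketch of the scoring step only; the segments below are hypothetical
# (in goal_success_ratio they come from the SAME_GOAL / NEW_GOAL classification).
conversation_segments = [
    ["user msg", "assistant msg"],                               # goal reached in 1 exchange -> 2/2 = 1.0
    ["user msg", "assistant msg", "user msg", "assistant msg"],  # goal took 2 exchanges      -> 2/4 = 0.5
]

score = sum(2 / len(segment) for segment in conversation_segments) / len(conversation_segments)
print(score)  # 0.75
```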
160 changes: 160 additions & 0 deletions parea/evals/general.py
@@ -0,0 +1,160 @@
from typing import Callable

import ast
import re

from parea.evals.utils import call_openai, sent_tokenize
from parea.schemas.models import Log

one_score_pattern = re.compile(r"\[\[(\d+\.?\d*)\]\]")
one_score_pattern_backup = re.compile(r"\[(\d+\.?\d*)\]")


def judge_llm_factory(model: str) -> Callable[[Log], float]:
    """Measures the quality of the generated response with an LLM judge on a scale of 1 to 10."""

    def _eval_judge_llm(log: Log) -> float:
        question = log.inputs["question"]
        output = log.output
        rating_response = call_openai(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {
                    "role": "user",
                    "content": f"[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response "
                    f"provided by an AI assistant to the user question displayed below. Your evaluation should "
                    f"consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and "
                    f"level of detail of the response. Begin your evaluation by providing a short explanation. "
                    f"Be as objective as possible. After providing your explanation, you must rate the response "
                    f'on a scale of 1 to 10 by strictly following this format: "[[rating]]", for example: '
                    f'"Rating: [[5]]".\n\n[Question]\n{question}\n\n[The Start of Assistant\'s Answer]'
                    f"\n{output}\n[The End of Assistant's Answer]",
                },
            ],
            temperature=0.0,
        )
        match = re.search(one_score_pattern, rating_response)
        if not match:
            match = re.search(one_score_pattern_backup, rating_response)

        if match:
            rating = ast.literal_eval(match.groups()[0])
        else:
            rating = 0

        return rating / 10.0

    return _eval_judge_llm


judge_llm_gpt4 = judge_llm_factory("gpt-4")

judge_llm_gpt3t = judge_llm_factory("gpt-3.5-turbo")
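For reference, this is how a judge response would be parsed into a normalized score; the judge output below is a hard-coded example rather than a real call_openai response:

```python
import ast
import re

one_score_pattern = re.compile(r"\[\[(\d+\.?\d*)\]\]")
one_score_pattern_backup = re.compile(r"\[(\d+\.?\d*)\]")

# Hypothetical judge output; in _eval_judge_llm this comes from call_openai.
rating_response = "The answer is accurate and relevant but lacks depth. Rating: [[7]]"

match = re.search(one_score_pattern, rating_response) or re.search(one_score_pattern_backup, rating_response)
rating = ast.literal_eval(match.groups()[0]) if match else 0
print(rating / 10.0)  # 0.7
```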


def self_check_gpt(log: Log) -> float:
    """Measures how consistent the model's output is when the response is resampled."""
    messages = [m.to_dict() for m in log.configuration.messages]

    n_sampled_outputs = 5
    sampled_outputs = []
    for _ in range(n_sampled_outputs):
        response = call_openai(
            messages=messages,
            model=log.configuration.model,
            temperature=1.0,
            max_tokens=log.configuration.model_params.max_length,
            top_p=log.configuration.model_params.top_p,
            frequency_penalty=log.configuration.model_params.frequency_penalty,
            presence_penalty=log.configuration.model_params.presence_penalty,
        )
        sampled_outputs.append(response)

    sentences = sent_tokenize(log.output)

    sentences_scores = []
    for sentence in sentences:
        scores = []
        for sampled_output in sampled_outputs:
            response = call_openai(
                messages=[
                    {
                        "role": "user",
                        "content": f"""Context: {sampled_output}
Sentence: {sentence}
Is the sentence supported by the context above?
Answer Yes or No:""",
                    }
                ],
                model="gpt-3.5-turbo",
                temperature=0.0,
            )
            scores.append(float("yes" in response.lower()))
        sentences_scores.append(sum(scores) / len(scores))

    return sum(sentences_scores) / len(sentences_scores)
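To make the aggregation concrete, here is a sketch using hypothetical support judgments (1.0 for a "Yes" answer, 0.0 for "No") in place of the gpt-3.5-turbo calls:

```python
# Hypothetical per-sentence support judgments against 5 resampled outputs.
sentence_support = [
    [1.0, 1.0, 1.0, 1.0, 0.0],  # sentence 1: supported by 4 of 5 samples -> 0.8
    [1.0, 0.0, 0.0, 1.0, 0.0],  # sentence 2: supported by 2 of 5 samples -> 0.4
]

sentences_scores = [sum(scores) / len(scores) for scores in sentence_support]
print(sum(sentences_scores) / len(sentences_scores))  # ~0.6
```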


def lm_vs_lm_factuality_factory(examiner_model: str = "gpt-3.5-turbo") -> Callable[[Log], float]:
    """Using an examining LLM, measures the factuality of a claim. The examining LLM asks the examined LLM
    follow-up questions until it reaches a conclusion."""

    def lm_vs_lm_factuality(log: Log) -> float:
        output = log.output
        messages_examinee = [m.to_dict() for m in log.configuration.messages]

        # ask the examiner for follow-up questions
        setup_prompt = f"""Your goal is to try to verify the correctness of the following claim: {output}, based on the background information you will gather. To gather this, You will provide short questions whose purpose will be to verify the correctness of the claim, and I will reply to you with the answers to these. Hopefully, with the help of the background questions and their answers, you will be able to reach a conclusion as to whether the claim is correct or possibly incorrect. Please keep asking questions as long as you’re yet to be sure regarding the true veracity of the claim. Please start with the first questions."""
        messages_examiner = [{"role": "user", "content": setup_prompt}]
        follow_up_questions = call_openai(
            model=examiner_model,
            messages=messages_examiner,
            temperature=0.0,
        )
        messages_examiner += [{"role": "assistant", "content": follow_up_questions}]

        follow_up_prompt = """(i) Do you have any follow-up questions? Please answer with Yes or No.
(ii) What are the follow-up questions?"""
        # ask the examinee the follow-up questions until the examiner reaches a conclusion
        while follow_up_questions is not None:
            messages_examinee += [{"role": "user", "content": follow_up_questions}]
            follow_up_answers = call_openai(
                model=log.configuration.model,
                messages=messages_examinee,
                temperature=log.configuration.model_params.temp,
                top_p=log.configuration.model_params.top_p,
                frequency_penalty=log.configuration.model_params.frequency_penalty,
                presence_penalty=log.configuration.model_params.presence_penalty,
                max_tokens=log.configuration.model_params.max_length,
            )
            messages_examiner += [
                {"role": "assistant", "content": follow_up_answers},
                {"role": "user", "content": follow_up_prompt},
            ]

            examiner_response = call_openai(
                model=examiner_model,
                messages=messages_examiner,
                temperature=0.0,
            )
            messages_examiner += [{"role": "assistant", "content": examiner_response}]
            if "yes" in examiner_response.lower():
                follow_up_questions = examiner_response
                messages_examinee += [{"role": "assistant", "content": follow_up_answers}]
            else:
                follow_up_questions = None

        # ask the examiner for their conclusion
        factuality_decision_prompt = """Based on the interviewee’s answers to your questions, what is your conclusion regarding the correctness of the claim? Do you think it is correct or incorrect?"""
        messages_examiner += [{"role": "user", "content": factuality_decision_prompt}]
        examiner_response = call_openai(
            model=examiner_model,
            messages=messages_examiner,
            temperature=0.0,
        )
        return float("incorrect" not in examiner_response.lower())

    return lm_vs_lm_factuality
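Mirroring judge_llm_gpt4 and judge_llm_gpt3t above, module-level evaluation functions could be instantiated from this factory as sketched below; these names are illustrative and are not defined in this commit:

```python
# Hypothetical instantiations; the commit only ships the factory itself.
lm_vs_lm_factuality_gpt4 = lm_vs_lm_factuality_factory("gpt-4")
lm_vs_lm_factuality_gpt3t = lm_vs_lm_factuality_factory("gpt-3.5-turbo")

# Each returned function takes a Log and returns 1.0 (claim judged correct) or 0.0 (judged incorrect).
```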
