
Commit

Merge pull request #259 from parea-ai/refactor-blogpost-reference-evaluations

feat: add context ranking listwise
joschkabraun committed Dec 11, 2023
2 parents 254ff6b + 64cbc09 commit 5e252b2
Showing 26 changed files with 808 additions and 622 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -35,8 +35,8 @@ local CSV file if you don't have a Parea API key.

Evaluation functions receive an argument `log` (of type [Log](parea/schemas/models.py)) and should return a
float between 0 (bad) and 1 (good) inclusive. You don't need to start from scratch, there are pre-defined evaluation
-functions for [general purpose](parea/evals/general.py),
-[chat](parea/evals/chat.py), [RAG](parea/evals/rag.py), and [summarization](parea/evals/summary.py) apps :)
+functions for [general purpose](parea/evals/general),
+[chat](parea/evals/chat), [RAG](parea/evals/rag), and [summarization](parea/evals/summary) apps :)

You can define evaluation functions locally or use the ones you have deployed to
Parea's [Test Hub](https://app.parea.ai/test-hub).
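As a quick illustration of the contract described in the README text above, here is a minimal sketch of a local evaluation function; it only relies on `log.output`, and any other `Log` fields would be assumptions based on this diff rather than documented API.

from parea.schemas.log import Log


def output_is_concise(log: Log) -> float:
    """Toy local eval: score 1.0 if the generated answer stays under 100 words, else 0.0."""
    output = log.output or ""
    return float(len(output.split()) < 100)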
1 change: 1 addition & 0 deletions parea/evals/chat/__init__.py
@@ -0,0 +1 @@
from .goal_success_ratio import goal_success_ratio_factory
File renamed without changes.
6 changes: 6 additions & 0 deletions parea/evals/general/__init__.py
@@ -0,0 +1,6 @@
from .answer_matches_target_llm_grader import answer_matches_target_llm_grader_factory
from .answer_matches_target_recall import answer_matches_target_recall
from .answer_relevancy import answer_relevancy_factory
from .llm_grader import llm_grader_factory, llm_grader_gpt3t, llm_grader_gpt4
from .lm_vs_lm import lm_vs_lm_factuality_factory, lm_vs_lm_factuality_gpt3t, lm_vs_lm_factuality_gpt4
from .self_check import self_check
36 changes: 36 additions & 0 deletions parea/evals/general/answer_matches_target_llm_grader.py
@@ -0,0 +1,36 @@
from typing import Callable, Optional

from parea.evals.utils import call_openai
from parea.schemas.log import Log


def answer_matches_target_llm_grader_factory(
    question_field: Optional[str] = "question",
    model: Optional[str] = "gpt-4",
) -> Callable[[Log], float]:
    """Quantifies how much the generated answer matches the ground truth / target."""

    def answer_matches_target_llm_grader(log: Log) -> float:
        question = log.inputs[question_field]
        output = log.output
        target = log.target
        response = call_openai(
            model=model,
            messages=[
                {"role": "system", "content": "You are CompareGPT, a machine to verify the groundedness of predictions. Answer with only yes/no."},
                {
                    "role": "user",
                    "content": f"""You are given a question, the corresponding ground-truth answer and a prediction from a model. Compare the "Ground-truth answer" and the "Prediction" to determine whether the prediction correctly answers the question. All information in the ground-truth answer must be present in the prediction, including numbers and dates. You must answer "no" if there are any specific details in the ground-truth answer that are not mentioned in the prediction. There should be no contradicting statements in the prediction. The prediction may contain extra information. If the prediction states something as a possibility, treat it as a definitive answer.
Question: {question}
Ground-truth answer: {target}
Prediction: {output}
CompareGPT response:""",
                },
            ],
            temperature=0.0,
        )
        return float("yes" in response.lower())

    return answer_matches_target_llm_grader
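A hedged usage sketch for the factory above; the `Log` constructor fields shown here are assumed from how the eval reads the log and may not match the real schema exactly.

from parea.evals.general import answer_matches_target_llm_grader_factory
from parea.schemas.log import Log

# Bind the grader to the "question" input field and GPT-4 (the factory defaults above).
matches_target = answer_matches_target_llm_grader_factory(question_field="question", model="gpt-4")

# Hypothetical log; a real one is produced by Parea's instrumentation.
log = Log(inputs={"question": "When was PSLV-C56 launched?"}, output="On 30 July 2023.", target="30 July 2023")
score = matches_target(log)  # 1.0 if CompareGPT answers "yes", otherwise 0.0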
27 changes: 27 additions & 0 deletions parea/evals/general/answer_matches_target_recall.py
@@ -0,0 +1,27 @@
from collections import Counter

from parea.schemas.log import Log


def answer_matches_target_recall(log: Log) -> float:
    """Proportion of tokens in the target/reference answer that also appear in the model generation."""
    target = log.target
    output = log.output

    provider = log.configuration.provider
    model = log.configuration.model

    if provider == "openai":
        import tiktoken

        encoding = tiktoken.encoding_for_model(model)
        target_tokens = encoding.encode(target)
        output_tokens = encoding.encode(output)
    else:
        raise NotImplementedError

    if len(target_tokens) == 0:
        return 1.0
    common_tokens = Counter(target_tokens) & Counter(output_tokens)
    num_common = sum(common_tokens.values())
    return num_common / len(target_tokens)
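To make the recall computation above concrete, a small worked example on hypothetical token ids (no tiktoken needed):

from collections import Counter

target_tokens = [10, 20, 20, 30]      # reference answer tokens (hypothetical ids)
output_tokens = [20, 30, 40, 20, 50]  # model output tokens (hypothetical ids)

common_tokens = Counter(target_tokens) & Counter(output_tokens)  # multiset intersection -> {20: 2, 30: 1}
recall = sum(common_tokens.values()) / len(target_tokens)        # 3 / 4 = 0.75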
44 changes: 44 additions & 0 deletions parea/evals/general/answer_relevancy.py
@@ -0,0 +1,44 @@
from typing import Callable

from parea.evals.utils import call_openai, embed
from parea.schemas.log import Log


def answer_relevancy_factory(question_field: str = "question", n_generations: int = 3) -> Callable[[Log], float]:
    """Quantifies how much the generated answer relates to the query."""
    try:
        import numpy as np
    except ImportError:
        raise ImportError("Please install numpy to use this metric.")

    def answer_relevancy(log: Log) -> float:
        """Quantifies how much the generated answer relates to the query."""
        question = log.inputs[question_field]
        output = log.output

        generated_questions = call_openai(
            model="gpt-3.5-turbo-16k",
            messages=[
                {
                    "role": "user",
                    "content": f"""\
Generate question for the given answer.
Answer:\nThe PSLV-C56 mission is scheduled to be launched on Sunday, 30 July 2023 at 06:30 IST / 01:00 UTC. It will be launched from the Satish Dhawan Space Centre, Sriharikota, Andhra Pradesh, India
Question: When is the scheduled launch date and time for the PSLV-C56 mission, and where will it be launched from?
Answer: {output}
Question:""",
                }
            ],
            temperature=0.0,
            n=n_generations,
        )
        embedded_generated_questions = [embed(model="text-embedding-ada-002", input=q) for q in generated_questions]
        embedded_question = embed(model="text-embedding-ada-002", input=question)

        question_vec = np.asarray(embedded_question).reshape(1, -1)
        gen_question_vec = np.asarray(embedded_generated_questions)
        norm = np.linalg.norm(gen_question_vec, axis=1) * np.linalg.norm(question_vec, axis=1)
        return (np.dot(gen_question_vec, question_vec.T).reshape(-1) / norm).mean()

    return answer_relevancy
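A hedged usage sketch: build the metric once via the factory, then apply it to logs (requires numpy plus OpenAI chat and embedding access).

from parea.evals.general import answer_relevancy_factory

answer_relevancy = answer_relevancy_factory(question_field="question", n_generations=3)
# answer_relevancy(log) returns the mean cosine similarity between the embedded original question
# and questions the LLM generates back from the answer; higher means the answer stays on topic.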
52 changes: 52 additions & 0 deletions parea/evals/general/llm_grader.py
@@ -0,0 +1,52 @@
from typing import Callable

import ast
import re

from parea.evals.utils import call_openai
from parea.schemas.log import Log

one_score_pattern = re.compile(r"\[\[(\d+\.?\d*)\]\]")
one_score_pattern_backup = re.compile(r"\[(\d+\.?\d*)\]")


def llm_grader_factory(model: str, question_field: str = "question") -> Callable[[Log], float]:
    """Measures the quality of the generated response on a scale of 1 to 10 using an LLM grader."""

    def llm_grader(log: Log) -> float:
        question = log.inputs[question_field]
        output = log.output
        rating_response = call_openai(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {
                    "role": "user",
                    "content": f"[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response "
                    f"provided by an AI assistant to the user question displayed below. Your evaluation should "
                    f"consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and "
                    f"level of detail of the response. Begin your evaluation by providing a short explanation. "
                    f"Be as objective as possible. After providing your explanation, you must rate the response "
                    f'on a scale of 1 to 10 by strictly following this format: "[[rating]]", for example: '
                    f'"Rating: [[5]]".\n\n[Question]\n{question}\n\n[The Start of Assistant\'s Answer]'
                    f"\n{output}\n[The End of Assistant's Answer]",
                },
            ],
            temperature=0.0,
        )
        match = re.search(one_score_pattern, rating_response)
        if not match:
            match = re.search(one_score_pattern_backup, rating_response)

        if match:
            rating = ast.literal_eval(match.groups()[0])
        else:
            rating = 0

        return rating / 10.0

    return llm_grader


llm_grader_gpt4 = llm_grader_factory("gpt-4")
llm_grader_gpt3t = llm_grader_factory("gpt-3.5-turbo-16k")
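Besides the two pre-built graders exported above, the factory can be bound to any judge model; the model name below is illustrative only.

from parea.evals.general import llm_grader_factory

llm_grader_gpt4_turbo = llm_grader_factory("gpt-4-1106-preview")  # hypothetical choice of judge model
# llm_grader_gpt4_turbo(log) maps the judge's 1-10 rating to 0.1-1.0, or 0.0 if no "[[rating]]" is found.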
101 changes: 1 addition & 100 deletions parea/evals/general.py → parea/evals/general/lm_vs_lm.py
@@ -1,106 +1,8 @@
from typing import Callable

import ast
import re

from parea.evals.utils import call_openai, sent_tokenize
from parea.evals.utils import call_openai
from parea.schemas.log import Log

one_score_pattern = re.compile("\[\[(\d+\.?\d*)\]\]")
one_score_pattern_backup = re.compile("\[(\d+\.?\d*)\]")


def judge_llm_factory(model: str, question_field: str = "question") -> Callable[[Log], float]:
    """Measures the generated response quality by using a LLM on a scale of 1 to 10."""

    def _eval_judge_llm(log: Log) -> float:
        question = log.inputs[question_field]
        output = log.output
        rating_response = call_openai(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {
                    "role": "user",
                    "content": f"[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response "
                    f"provided by an AI assistant to the user question displayed below. Your evaluation should "
                    f"consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and "
                    f"level of detail of the response. Begin your evaluation by providing a short explanation. "
                    f"Be as objective as possible. After providing your explanation, you must rate the response "
                    f'on a scale of 1 to 10 by strictly following this format: "[[rating]]", for example: '
                    f'"Rating: [[5]]".\n\n[Question]\n{question}\n\n[The Start of Assistant\'s Answer]'
                    f"\n{output}\n[The End of Assistant's Answer]",
                },
            ],
            temperature=0.0,
        )
        match = re.search(one_score_pattern, rating_response)
        if not match:
            match = re.search(one_score_pattern_backup, rating_response)

        if match:
            rating = ast.literal_eval(match.groups()[0])
        else:
            rating = 0

        return rating / 10.0

    return _eval_judge_llm


judge_llm_gpt4 = judge_llm_factory("gpt-4")

judge_llm_gpt3t = judge_llm_factory("gpt-3.5-turbo-16k")


def self_check_gpt(log: Log) -> float:
    """Measures how consistent is the output of a model under resampling the response."""
    if log.configuration is None or log.configuration.messages is None:
        return 0.0

    messages = [m.to_dict() for m in log.configuration.messages]

    n_sampled_outputs = 5
    sampled_outputs = []
    for _ in range(n_sampled_outputs):
        response = call_openai(
            messages=messages,
            model=log.configuration.model,
            temperature=1.0,
            max_tokens=log.configuration.model_params.max_length,
            top_p=log.configuration.model_params.top_p,
            frequency_penalty=log.configuration.model_params.frequency_penalty,
            presence_penalty=log.configuration.model_params.presence_penalty,
        )
        sampled_outputs.append(response)

    sentences = sent_tokenize(log.output)

    if len(sentences) == 0:
        return 0.0

    sentences_scores = []
    for sentence in sentences:
        scores = []
        for sampled_output in sampled_outputs:
            response = call_openai(
                messages=[
                    {
                        "role": "user",
                        "content": f"""Context: {sampled_output}
Sentence: {sentence}
Is the sentence supported by the context above?
Answer Yes or No:""",
                    }
                ],
                model="gpt-3.5-turbo",
                temperature=0.0,
            )
            scores.append(float("yes" in response.lower()))
        sentences_scores.append(sum(scores) / len(scores))

    return sum(sentences_scores) / len(sentences_scores)


def lm_vs_lm_factuality_factory(examiner_model: str = "gpt-3.5-turbo") -> Callable[[Log], float]:
    """Using an examining LLM, measures the factuality of a claim. Examining LLM asks follow-up questions to the other
@@ -171,5 +73,4 @@ def lm_vs_lm_factuality(log: Log) -> float:


lm_vs_lm_factuality_gpt4 = lm_vs_lm_factuality_factory("gpt-4")

lm_vs_lm_factuality_gpt3t = lm_vs_lm_factuality_factory("gpt-3.5-turbo-16k")
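A hedged usage sketch for the factory kept by this rename; per its docstring, the returned eval has an examiner LLM cross-question the original model about its claim.

from parea.evals.general import lm_vs_lm_factuality_factory

lm_vs_lm_eval = lm_vs_lm_factuality_factory(examiner_model="gpt-4")  # examiner defaults to "gpt-3.5-turbo"
# lm_vs_lm_eval(log) scores the factuality of log.output based on the examiner's verdict.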
51 changes: 51 additions & 0 deletions parea/evals/general/self_check.py
@@ -0,0 +1,51 @@
from parea.evals.utils import call_openai, sent_tokenize
from parea.schemas.log import Log


def self_check(log: Log) -> float:
    """Measures how consistent the model's output is when the response is resampled."""
    if log.configuration is None or log.configuration.messages is None:
        return 0.0

    messages = [m.to_dict() for m in log.configuration.messages]

    n_sampled_outputs = 5
    sampled_outputs = []
    for _ in range(n_sampled_outputs):
        response = call_openai(
            messages=messages,
            model=log.configuration.model,
            temperature=1.0,
            max_tokens=log.configuration.model_params.max_length,
            top_p=log.configuration.model_params.top_p,
            frequency_penalty=log.configuration.model_params.frequency_penalty,
            presence_penalty=log.configuration.model_params.presence_penalty,
        )
        sampled_outputs.append(response)

    sentences = sent_tokenize(log.output)

    if len(sentences) == 0:
        return 0.0

    sentences_scores = []
    for sentence in sentences:
        scores = []
        for sampled_output in sampled_outputs:
            response = call_openai(
                messages=[
                    {
                        "role": "user",
                        "content": f"""Context: {sampled_output}
Sentence: {sentence}
Is the sentence supported by the context above?
Answer Yes or No:""",
                    }
                ],
                model="gpt-3.5-turbo",
                temperature=0.0,
            )
            scores.append(float("yes" in response.lower()))
        sentences_scores.append(sum(scores) / len(scores))

    return sum(sentences_scores) / len(sentences_scores)
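A short hedged note on using `self_check`: it needs the full request configuration on the log, not just the output.

from parea.evals.general import self_check

# self_check resamples the original prompt five times at temperature 1.0 and returns the fraction of
# output sentences that the resampled generations support; it returns 0.0 when log.configuration (or
# its messages) is missing. `log` here is assumed to come from an instrumented Parea call.
score = self_check(log)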