Merge pull request #248 from parea-ai/feat-eval-funcs-2
feat: add guardrails for llm vs llm
joschkabraun committed Nov 28, 2023
2 parents 33beae0 + 892fbf5 commit 8c1729b
Showing 4 changed files with 29 additions and 14 deletions.
parea/evals/general.py (31 changes: 23 additions & 8 deletions)
@@ -10,11 +10,11 @@
one_score_pattern_backup = re.compile("\[(\d+\.?\d*)\]")


-def judge_llm_factory(model: str) -> Callable[[Log], float]:
+def judge_llm_factory(model: str, question_field: str = "question") -> Callable[[Log], float]:
"""Measures the generated response quality by using a LLM on a scale of 1 to 10."""

def _eval_judge_llm(log: Log) -> float:
-question = log.inputs["question"]
+question = log.inputs[question_field]
output = log.output
rating_response = call_openai(
model=model,
@@ -50,11 +50,14 @@ def _eval_judge_llm(log: Log) -> float:

judge_llm_gpt4 = judge_llm_factory("gpt-4")

-judge_llm_gpt3t = judge_llm_factory("gpt-3.5-turbo")
+judge_llm_gpt3t = judge_llm_factory("gpt-3.5-turbo-16k")


def self_check_gpt(log: Log) -> float:
"""Measures how consistent is the output of a model under resampling the response."""
+if log.configuration is None or log.configuration.messages is None:
+    return 0.0

messages = [m.to_dict() for m in log.configuration.messages]

n_sampled_outputs = 5
@@ -73,6 +76,9 @@ def self_check_gpt(log: Log) -> float:

sentences = sent_tokenize(log.output)

+if len(sentences) == 0:
+    return 0.0

sentences_scores = []
for sentence in sentences:
scores = []
@@ -105,14 +111,15 @@ def lm_vs_lm_factuality(log: Log) -> float:
messages_examinee = [m.to_dict() for m in log.configuration.messages]

# ask examiner for follow-up questions
-setup_prompt = f"""Your goal is to try to verify the correctness of the following claim: {output}, based on the background information you will gather. To gather this, You will provide short questions whose purpose will be to verify the correctness of the claim, and I will reply to you with the answers to these. Hopefully, with the help of the background questions and their answers, you will be able to reach a conclusion as to whether the claim is correct or possibly incorrect. Please keep asking questions as long as you’re yet to be sure regarding the true veracity of the claim. Please start with the first questions."""
+setup_prompt = f"""Your goal is to try to verify the correctness of the following claim: "{output}", based on the background information you will gather. To gather this, You will provide short questions whose purpose will be to verify the correctness of the claim, and I will reply to you with the answers to these. Hopefully, with the help of the background questions and their answers, you will be able to reach a conclusion as to whether the claim is correct or possibly incorrect. Please keep asking questions as long as you’re yet to be sure regarding the true veracity of the claim. Please start with the first questions."""
messages_examiner = [{"role": "user", "content": setup_prompt}]
follow_up_questions = call_openai(
model=examiner_model,
messages=messages_examiner,
temperature=0.0,
)
messages_examiner += [{"role": "assistant", "content": follow_up_questions}]
+n_rounds_follow_up_questions = 1

follow_up_prompt = """(i) Do you have any follow-up questions? Please answer with Yes or No.
(ii) What are the follow-up questions?"""
@@ -128,10 +135,13 @@ def lm_vs_lm_factuality(log: Log) -> float:
presence_penalty=log.configuration.model_params.presence_penalty,
max_tokens=log.configuration.model_params.max_length,
)
-messages_examiner += [
-    {"role": "assistant", "content": follow_up_answers},
-    {"role": "user", "content": follow_up_prompt},
-]
+messages_examiner.append({"role": "assistant", "content": follow_up_answers})
+
+if n_rounds_follow_up_questions > 3:
+    break
+else:
+    messages_examiner.append({"role": "user", "content": follow_up_prompt})
+    n_rounds_follow_up_questions += 1

examiner_response = call_openai(
model=examiner_model,
@@ -158,3 +168,8 @@ def lm_vs_lm_factuality(log: Log) -> float:
return float("incorrect" not in examiner_response.lower())

return lm_vs_lm_factuality


+lm_vs_lm_factuality_gpt4 = lm_vs_lm_factuality_factory("gpt-4")
+
+lm_vs_lm_factuality_gpt3t = lm_vs_lm_factuality_factory("gpt-3.5-turbo-16k")
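
For reviewers, a minimal usage sketch (not part of this diff) of the parameterized judge factory and the new lm_vs_lm_factuality exports. The Log construction, its import path, and the "query" input key are assumptions for illustration; only the attributes the evals read above (inputs, output, configuration) come from the code.

# Hypothetical sketch, not part of the commit: exercising the new factory arguments.
from parea.evals.general import judge_llm_factory, lm_vs_lm_factuality_factory
from parea.schemas.models import Log  # import path is an assumption

# question_field lets the judge read a custom input key instead of "question".
judge_on_query = judge_llm_factory("gpt-4", question_field="query")

# The examiner model is passed to the factory; the guardrail added in this
# commit caps the examiner at three rounds of follow-up questions.
check_factuality = lm_vs_lm_factuality_factory("gpt-4")

log = Log(
    inputs={"query": "When did Apollo 11 land on the moon?"},
    output="Apollo 11 landed on the moon on July 20, 1969.",
)

score = judge_on_query(log)  # float score from the judge LLM

# check_factuality(log) additionally needs log.configuration (messages, model,
# model_params), so a Log as bare as the one above is not sufficient for it.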
parea/evals/rag.py (6 changes: 3 additions & 3 deletions)
@@ -72,7 +72,7 @@ def llm_critique_faithfulness(log: Log) -> float:

def recall_response(log: Log) -> float:
"""Prop. of tokens in target/reference answer which are also in model generation."""
-target = log.inputs["target"]
+target = log.target
output = log.output

provider = log.configuration.provider
@@ -240,7 +240,7 @@ def ragas_answer_context_faithfulness(log: Log) -> float:
return ragas_answer_context_faithfulness


-def ragas_answer_relevancy_factor(question_field: str = "question", n_generations: int = 3) -> Callable[[Log], float]:
+def ragas_answer_relevancy_factory(question_field: str = "question", n_generations: int = 3) -> Callable[[Log], float]:
"""Quantifies how much the generated answer relates to the query."""
try:
import numpy as np
@@ -290,8 +290,8 @@ def ragas_context_ranking_factory(question_field: str = "question", context_fiel

def ragas_context_ranking(log: Log) -> float:
"""Quantifies if the retrieved context is ranked by their relevancy"""
-contexts = [log.inputs[context_field] for context_field in context_fields]
question = log.inputs[question_field]
+contexts = [log.inputs[context_field] for context_field in context_fields]

verifications = []
for context in contexts:
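Similarly, a short sketch (not part of the diff) of how the renamed ragas_answer_relevancy_factory and the context-ranking factory shown above might be instantiated; the context field names are illustrative, and everything beyond the signatures visible in these hunks is an assumption.

# Hypothetical sketch: building Log -> float evals from the rag.py factories.
from parea.evals.rag import ragas_answer_relevancy_factory, ragas_context_ranking_factory

# Renamed in this commit from ragas_answer_relevancy_factor.
answer_relevancy = ragas_answer_relevancy_factory(question_field="question", n_generations=3)

# ragas_context_ranking reads log.inputs[question_field] and one input per context field.
context_ranking = ragas_context_ranking_factory(
    question_field="question",
    context_fields=["context_1", "context_2"],  # illustrative field names
)

# recall_response (same module) now takes the reference answer from log.target
# instead of log.inputs["target"].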
parea/evals/utils.py (4 changes: 2 additions & 2 deletions)
Expand Up @@ -61,6 +61,6 @@ def call_openai(messages, model, temperature=1.0, max_tokens=None, top_p=1.0, fr

def embed(model, input) -> List[float]:
if openai_version.startswith("0."):
-return openai.Embedding.create(model=model, input=input, encoding_format="float")["embedding"]
+return openai.Embedding.create(model=model, input=input, encoding_format="float").data[0]["embedding"]
else:
-return openai.embeddings.create(model=model, input=input, encoding_format="float").embedding
+return openai.embeddings.create(model=model, input=input, encoding_format="float").data[0].embedding
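
The utils.py change fixes embed for both OpenAI client generations: the embeddings endpoint returns a response whose data field is a list with one entry per input, so the vector lives at data[0] rather than on the response object itself. A tiny illustrative check (not part of the diff); the model name is only an example.

# Hypothetical sketch: for a single string input, the fixed helper should
# return one flat list of floats.
from parea.evals.utils import embed

vector = embed(model="text-embedding-ada-002", input="hello world")
assert isinstance(vector, list) and isinstance(vector[0], float)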
pyproject.toml (2 changes: 1 addition & 1 deletion)
Expand Up @@ -6,7 +6,7 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry]
name = "parea-ai"
packages = [{ include = "parea" }]
-version = "0.2.17"
+version = "0.2.18"
description = "Parea python sdk"
readme = "README.md"
authors = ["joel-parea-ai <[email protected]>"]
