Merge pull request #248 from parea-ai/feat-eval-funcs-2
feat: add guardrails for llm vs llm
joschkabraun committed Nov 28, 2023
2 parents 33beae0 + 892fbf5 commit 8c1729b
Showing 4 changed files with 29 additions and 14 deletions.
parea/evals/general.py (31 changes: 23 additions & 8 deletions)
@@ -10,11 +10,11 @@
one_score_pattern_backup = re.compile("\[(\d+\.?\d*)\]")


-def judge_llm_factory(model: str) -> Callable[[Log], float]:
+def judge_llm_factory(model: str, question_field: str = "question") -> Callable[[Log], float]:
"""Measures the generated response quality by using a LLM on a scale of 1 to 10."""

def _eval_judge_llm(log: Log) -> float:
-question = log.inputs["question"]
+question = log.inputs[question_field]
output = log.output
rating_response = call_openai(
model=model,
@@ -50,11 +50,14 @@ def _eval_judge_llm(log: Log) -> float:

judge_llm_gpt4 = judge_llm_factory("gpt-4")

-judge_llm_gpt3t = judge_llm_factory("gpt-3.5-turbo")
+judge_llm_gpt3t = judge_llm_factory("gpt-3.5-turbo-16k")


def self_check_gpt(log: Log) -> float:
"""Measures how consistent is the output of a model under resampling the response."""
+if log.configuration is None or log.configuration.messages is None:
+    return 0.0

messages = [m.to_dict() for m in log.configuration.messages]

n_sampled_outputs = 5
@@ -73,6 +76,9 @@ def self_check_gpt(log: Log) -> float:

sentences = sent_tokenize(log.output)

+if len(sentences) == 0:
+    return 0.0

sentences_scores = []
for sentence in sentences:
scores = []
@@ -105,14 +111,15 @@ def lm_vs_lm_factuality(log: Log) -> float:
messages_examinee = [m.to_dict() for m in log.configuration.messages]

# ask examiner for follow-up questions
-setup_prompt = f"""Your goal is to try to verify the correctness of the following claim: {output}, based on the background information you will gather. To gather this, You will provide short questions whose purpose will be to verify the correctness of the claim, and I will reply to you with the answers to these. Hopefully, with the help of the background questions and their answers, you will be able to reach a conclusion as to whether the claim is correct or possibly incorrect. Please keep asking questions as long as you’re yet to be sure regarding the true veracity of the claim. Please start with the first questions."""
+setup_prompt = f"""Your goal is to try to verify the correctness of the following claim: "{output}", based on the background information you will gather. To gather this, You will provide short questions whose purpose will be to verify the correctness of the claim, and I will reply to you with the answers to these. Hopefully, with the help of the background questions and their answers, you will be able to reach a conclusion as to whether the claim is correct or possibly incorrect. Please keep asking questions as long as you’re yet to be sure regarding the true veracity of the claim. Please start with the first questions."""
messages_examiner = [{"role": "user", "content": setup_prompt}]
follow_up_questions = call_openai(
model=examiner_model,
messages=messages_examiner,
temperature=0.0,
)
messages_examiner += [{"role": "assistant", "content": follow_up_questions}]
+n_rounds_follow_up_questions = 1

follow_up_prompt = """(i) Do you have any follow-up questions? Please answer with Yes or No.
(ii) What are the follow-up questions?"""
@@ -128,10 +135,13 @@ def lm_vs_lm_factuality(log: Log) -> float:
presence_penalty=log.configuration.model_params.presence_penalty,
max_tokens=log.configuration.model_params.max_length,
)
-messages_examiner += [
-    {"role": "assistant", "content": follow_up_answers},
-    {"role": "user", "content": follow_up_prompt},
-]
+messages_examiner.append({"role": "assistant", "content": follow_up_answers})
+
+if n_rounds_follow_up_questions > 3:
+    break
+else:
+    messages_examiner.append({"role": "user", "content": follow_up_prompt})
+    n_rounds_follow_up_questions += 1

examiner_response = call_openai(
model=examiner_model,
@@ -158,3 +168,8 @@ def lm_vs_lm_factuality(log: Log) -> float:
return float("incorrect" not in examiner_response.lower())

return lm_vs_lm_factuality


+lm_vs_lm_factuality_gpt4 = lm_vs_lm_factuality_factory("gpt-4")
+
+lm_vs_lm_factuality_gpt3t = lm_vs_lm_factuality_factory("gpt-3.5-turbo-16k")
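
For reviewers, a minimal usage sketch (not part of this diff) of the parameterized judge factory and the new lm_vs_lm_factuality exports. The Log construction, its import path, and the "query" input key are assumptions for illustration; only the attributes the evals read above (inputs, output, configuration) come from the code.

# Hypothetical sketch, not part of the commit: exercising the new factory arguments.
from parea.evals.general import judge_llm_factory, lm_vs_lm_factuality_factory
from parea.schemas.models import Log  # import path is an assumption

# question_field lets the judge read a custom input key instead of "question".
judge_on_query = judge_llm_factory("gpt-4", question_field="query")

# The examiner model is passed to the factory; the guardrail added in this
# commit caps the examiner at three rounds of follow-up questions.
check_factuality = lm_vs_lm_factuality_factory("gpt-4")

log = Log(
    inputs={"query": "When did Apollo 11 land on the moon?"},
    output="Apollo 11 landed on the moon on July 20, 1969.",
)

score = judge_on_query(log)  # float score from the judge LLM

# check_factuality(log) additionally needs log.configuration (messages, model,
# model_params), so a Log as bare as the one above is not sufficient for it.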
parea/evals/rag.py (6 changes: 3 additions & 3 deletions)
@@ -72,7 +72,7 @@ def llm_critique_faithfulness(log: Log) -> float:

def recall_response(log: Log) -> float:
"""Prop. of tokens in target/reference answer which are also in model generation."""
-target = log.inputs["target"]
+target = log.target
output = log.output

provider = log.configuration.provider
@@ -240,7 +240,7 @@ def ragas_answer_context_faithfulness(log: Log) -> float:
return ragas_answer_context_faithfulness


-def ragas_answer_relevancy_factor(question_field: str = "question", n_generations: int = 3) -> Callable[[Log], float]:
+def ragas_answer_relevancy_factory(question_field: str = "question", n_generations: int = 3) -> Callable[[Log], float]:
"""Quantifies how much the generated answer relates to the query."""
try:
import numpy as np
@@ -290,8 +290,8 @@ def ragas_context_ranking_factory(question_field: str = "question", context_fiel

def ragas_context_ranking(log: Log) -> float:
"""Quantifies if the retrieved context is ranked by their relevancy"""
-contexts = [log.inputs[context_field] for context_field in context_fields]
question = log.inputs[question_field]
+contexts = [log.inputs[context_field] for context_field in context_fields]

verifications = []
for context in contexts:
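Similarly, a short sketch (not part of the diff) of how the renamed ragas_answer_relevancy_factory and the context-ranking factory shown above might be instantiated; the context field names are illustrative, and everything beyond the signatures visible in these hunks is an assumption.

# Hypothetical sketch: building Log -> float evals from the rag.py factories.
from parea.evals.rag import ragas_answer_relevancy_factory, ragas_context_ranking_factory

# Renamed in this commit from ragas_answer_relevancy_factor.
answer_relevancy = ragas_answer_relevancy_factory(question_field="question", n_generations=3)

# ragas_context_ranking reads log.inputs[question_field] and one input per context field.
context_ranking = ragas_context_ranking_factory(
    question_field="question",
    context_fields=["context_1", "context_2"],  # illustrative field names
)

# recall_response (same module) now takes the reference answer from log.target
# instead of log.inputs["target"].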
parea/evals/utils.py (4 changes: 2 additions & 2 deletions)
Expand Up @@ -61,6 +61,6 @@ def call_openai(messages, model, temperature=1.0, max_tokens=None, top_p=1.0, fr

def embed(model, input) -> List[float]:
if openai_version.startswith("0."):
-return openai.Embedding.create(model=model, input=input, encoding_format="float")["embedding"]
+return openai.Embedding.create(model=model, input=input, encoding_format="float").data[0]["embedding"]
else:
-return openai.embeddings.create(model=model, input=input, encoding_format="float").embedding
+return openai.embeddings.create(model=model, input=input, encoding_format="float").data[0].embedding
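
The utils.py change fixes embed for both OpenAI client generations: the embeddings endpoint returns a response whose data field is a list with one entry per input, so the vector lives at data[0] rather than on the response object itself. A tiny illustrative check (not part of the diff); the model name is only an example.

# Hypothetical sketch: for a single string input, the fixed helper should
# return one flat list of floats.
from parea.evals.utils import embed

vector = embed(model="text-embedding-ada-002", input="hello world")
assert isinstance(vector, list) and isinstance(vector[0], float)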
pyproject.toml (2 changes: 1 addition & 1 deletion)
Expand Up @@ -6,7 +6,7 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry]
name = "parea-ai"
packages = [{ include = "parea" }]
-version = "0.2.17"
+version = "0.2.18"
description = "Parea python sdk"
readme = "README.md"
authors = ["joel-parea-ai <[email protected]>"]
