
Commit

Merge pull request #259 from parea-ai/refactor-blogpost-reference-evaluations

feat: add context ranking listwise
joschkabraun committed Dec 11, 2023
2 parents 254ff6b + 64cbc09 commit 5e252b2
Showing 26 changed files with 808 additions and 622 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -35,8 +35,8 @@ local CSV file if you don't have a Parea API key.

Evaluation functions receive an argument `log` (of type [Log](parea/schemas/models.py)) and should return a
float between 0 (bad) and 1 (good) inclusive. You don't need to start from scratch, there are pre-defined evaluation
-functions for [general purpose](parea/evals/general.py),
-[chat](parea/evals/chat.py), [RAG](parea/evals/rag.py), and [summarization](parea/evals/summary.py) apps :)
+functions for [general purpose](parea/evals/general),
+[chat](parea/evals/chat), [RAG](parea/evals/rag), and [summarization](parea/evals/summary) apps :)

You can define evaluation functions locally or use the ones you have deployed to
Parea's [Test Hub](https://app.parea.ai/test-hub).
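As a quick illustration of the contract described in the README text above, here is a minimal sketch of a local evaluation function; it only relies on `log.output`, and any other `Log` fields would be assumptions based on this diff rather than documented API.

from parea.schemas.log import Log


def output_is_concise(log: Log) -> float:
    """Toy local eval: score 1.0 if the generated answer stays under 100 words, else 0.0."""
    output = log.output or ""
    return float(len(output.split()) < 100)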
1 change: 1 addition & 0 deletions parea/evals/chat/__init__.py
@@ -0,0 +1 @@
from .goal_success_ratio import goal_success_ratio_factory
File renamed without changes.
6 changes: 6 additions & 0 deletions parea/evals/general/__init__.py
@@ -0,0 +1,6 @@
from .answer_matches_target_llm_grader import answer_matches_target_llm_grader_factory
from .answer_matches_target_recall import answer_matches_target_recall
from .answer_relevancy import answer_relevancy_factory
from .llm_grader import llm_grader_factory, llm_grader_gpt3t, llm_grader_gpt4
from .lm_vs_lm import lm_vs_lm_factuality_factory, lm_vs_lm_factuality_gpt3t, lm_vs_lm_factuality_gpt4
from .self_check import self_check
36 changes: 36 additions & 0 deletions parea/evals/general/answer_matches_target_llm_grader.py
@@ -0,0 +1,36 @@
from typing import Callable, Optional

from parea.evals.utils import call_openai
from parea.schemas.log import Log


def answer_matches_target_llm_grader_factory(
    question_field: Optional[str] = "question",
    model: Optional[str] = "gpt-4",
) -> Callable[[Log], float]:
    """Quantifies how much the generated answer matches the ground truth / target."""

    def answer_matches_target_llm_grader(log: Log) -> float:
        question = log.inputs[question_field]
        output = log.output
        target = log.target
        response = call_openai(
            model=model,
            messages=[
                {"role": "system", "content": "You are CompareGPT, a machine to verify the groundedness of predictions. Answer with only yes/no."},
                {
                    "role": "user",
                    "content": f"""You are given a question, the corresponding ground-truth answer and a prediction from a model. Compare the "Ground-truth answer" and the "Prediction" to determine whether the prediction correctly answers the question. All information in the ground-truth answer must be present in the prediction, including numbers and dates. You must answer "no" if there are any specific details in the ground-truth answer that are not mentioned in the prediction. There should be no contradicting statements in the prediction. The prediction may contain extra information. If the prediction states something as a possibility, treat it as a definitive answer.
Question: {question}
Ground-truth answer: {target}
Prediction: {output}
CompareGPT response:""",
                },
            ],
            temperature=0.0,
        )
        return float("yes" in response.lower())

    return answer_matches_target_llm_grader
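A hedged usage sketch for the factory above; the `Log` constructor fields shown here are assumed from how the eval reads the log and may not match the real schema exactly.

from parea.evals.general import answer_matches_target_llm_grader_factory
from parea.schemas.log import Log

# Bind the grader to the "question" input field and GPT-4 (the factory defaults above).
matches_target = answer_matches_target_llm_grader_factory(question_field="question", model="gpt-4")

# Hypothetical log; a real one is produced by Parea's instrumentation.
log = Log(inputs={"question": "When was PSLV-C56 launched?"}, output="On 30 July 2023.", target="30 July 2023")
score = matches_target(log)  # 1.0 if CompareGPT answers "yes", otherwise 0.0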
27 changes: 27 additions & 0 deletions parea/evals/general/answer_matches_target_recall.py
@@ -0,0 +1,27 @@
from collections import Counter

from parea.schemas.log import Log


def answer_matches_target_recall(log: Log) -> float:
    """Proportion of tokens in the target/reference answer that also appear in the model generation."""
    target = log.target
    output = log.output

    provider = log.configuration.provider
    model = log.configuration.model

    if provider == "openai":
        import tiktoken

        encoding = tiktoken.encoding_for_model(model)
        target_tokens = encoding.encode(target)
        output_tokens = encoding.encode(output)
    else:
        raise NotImplementedError

    if len(target_tokens) == 0:
        return 1.0
    common_tokens = Counter(target_tokens) & Counter(output_tokens)
    num_common = sum(common_tokens.values())
    return num_common / len(target_tokens)
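To make the recall computation above concrete, a small worked example on hypothetical token ids (no tiktoken needed):

from collections import Counter

target_tokens = [10, 20, 20, 30]      # reference answer tokens (hypothetical ids)
output_tokens = [20, 30, 40, 20, 50]  # model output tokens (hypothetical ids)

common_tokens = Counter(target_tokens) & Counter(output_tokens)  # multiset intersection -> {20: 2, 30: 1}
recall = sum(common_tokens.values()) / len(target_tokens)        # 3 / 4 = 0.75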
44 changes: 44 additions & 0 deletions parea/evals/general/answer_relevancy.py
@@ -0,0 +1,44 @@
from typing import Callable

from parea.evals.utils import call_openai, embed
from parea.schemas.log import Log


def answer_relevancy_factory(question_field: str = "question", n_generations: int = 3) -> Callable[[Log], float]:
    """Quantifies how much the generated answer relates to the query."""
    try:
        import numpy as np
    except ImportError:
        raise ImportError("Please install numpy to use this metric.")

    def answer_relevancy(log: Log) -> float:
        """Quantifies how much the generated answer relates to the query."""
        question = log.inputs[question_field]
        output = log.output

        generated_questions = call_openai(
            model="gpt-3.5-turbo-16k",
            messages=[
                {
                    "role": "user",
                    "content": f"""\
Generate question for the given answer.
Answer:\nThe PSLV-C56 mission is scheduled to be launched on Sunday, 30 July 2023 at 06:30 IST / 01:00 UTC. It will be launched from the Satish Dhawan Space Centre, Sriharikota, Andhra Pradesh, India
Question: When is the scheduled launch date and time for the PSLV-C56 mission, and where will it be launched from?
Answer: {output}
Question:""",
                }
            ],
            temperature=0.0,
            n=n_generations,
        )
        embedded_generated_questions = [embed(model="text-embedding-ada-002", input=q) for q in generated_questions]
        embedded_question = embed(model="text-embedding-ada-002", input=question)

        question_vec = np.asarray(embedded_question).reshape(1, -1)
        gen_question_vec = np.asarray(embedded_generated_questions)
        norm = np.linalg.norm(gen_question_vec, axis=1) * np.linalg.norm(question_vec, axis=1)
        return (np.dot(gen_question_vec, question_vec.T).reshape(-1) / norm).mean()

    return answer_relevancy
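A hedged usage sketch: build the metric once via the factory, then apply it to logs (requires numpy plus OpenAI chat and embedding access).

from parea.evals.general import answer_relevancy_factory

answer_relevancy = answer_relevancy_factory(question_field="question", n_generations=3)
# answer_relevancy(log) returns the mean cosine similarity between the embedded original question
# and questions the LLM generates back from the answer; higher means the answer stays on topic.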
52 changes: 52 additions & 0 deletions parea/evals/general/llm_grader.py
@@ -0,0 +1,52 @@
from typing import Callable

import ast
import re

from parea.evals.utils import call_openai
from parea.schemas.log import Log

one_score_pattern = re.compile(r"\[\[(\d+\.?\d*)\]\]")
one_score_pattern_backup = re.compile(r"\[(\d+\.?\d*)\]")


def llm_grader_factory(model: str, question_field: str = "question") -> Callable[[Log], float]:
    """Measures the quality of the generated response on a scale of 1 to 10 using an LLM grader."""

    def llm_grader(log: Log) -> float:
        question = log.inputs[question_field]
        output = log.output
        rating_response = call_openai(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {
                    "role": "user",
                    "content": f"[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response "
                    f"provided by an AI assistant to the user question displayed below. Your evaluation should "
                    f"consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and "
                    f"level of detail of the response. Begin your evaluation by providing a short explanation. "
                    f"Be as objective as possible. After providing your explanation, you must rate the response "
                    f'on a scale of 1 to 10 by strictly following this format: "[[rating]]", for example: '
                    f'"Rating: [[5]]".\n\n[Question]\n{question}\n\n[The Start of Assistant\'s Answer]'
                    f"\n{output}\n[The End of Assistant's Answer]",
                },
            ],
            temperature=0.0,
        )
        match = re.search(one_score_pattern, rating_response)
        if not match:
            match = re.search(one_score_pattern_backup, rating_response)

        if match:
            rating = ast.literal_eval(match.groups()[0])
        else:
            rating = 0

        return rating / 10.0

    return llm_grader


llm_grader_gpt4 = llm_grader_factory("gpt-4")
llm_grader_gpt3t = llm_grader_factory("gpt-3.5-turbo-16k")
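Besides the two pre-built graders exported above, the factory can be bound to any judge model; the model name below is illustrative only.

from parea.evals.general import llm_grader_factory

llm_grader_gpt4_turbo = llm_grader_factory("gpt-4-1106-preview")  # hypothetical choice of judge model
# llm_grader_gpt4_turbo(log) maps the judge's 1-10 rating to 0.1-1.0, or 0.0 if no "[[rating]]" is found.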
101 changes: 1 addition & 100 deletions parea/evals/general.py → parea/evals/general/lm_vs_lm.py
@@ -1,106 +1,8 @@
from typing import Callable

import ast
import re

from parea.evals.utils import call_openai, sent_tokenize
from parea.evals.utils import call_openai
from parea.schemas.log import Log

one_score_pattern = re.compile("\[\[(\d+\.?\d*)\]\]")
one_score_pattern_backup = re.compile("\[(\d+\.?\d*)\]")


def judge_llm_factory(model: str, question_field: str = "question") -> Callable[[Log], float]:
    """Measures the generated response quality by using a LLM on a scale of 1 to 10."""

    def _eval_judge_llm(log: Log) -> float:
        question = log.inputs[question_field]
        output = log.output
        rating_response = call_openai(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {
                    "role": "user",
                    "content": f"[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response "
                    f"provided by an AI assistant to the user question displayed below. Your evaluation should "
                    f"consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and "
                    f"level of detail of the response. Begin your evaluation by providing a short explanation. "
                    f"Be as objective as possible. After providing your explanation, you must rate the response "
                    f'on a scale of 1 to 10 by strictly following this format: "[[rating]]", for example: '
                    f'"Rating: [[5]]".\n\n[Question]\n{question}\n\n[The Start of Assistant\'s Answer]'
                    f"\n{output}\n[The End of Assistant's Answer]",
                },
            ],
            temperature=0.0,
        )
        match = re.search(one_score_pattern, rating_response)
        if not match:
            match = re.search(one_score_pattern_backup, rating_response)

        if match:
            rating = ast.literal_eval(match.groups()[0])
        else:
            rating = 0

        return rating / 10.0

    return _eval_judge_llm


judge_llm_gpt4 = judge_llm_factory("gpt-4")

judge_llm_gpt3t = judge_llm_factory("gpt-3.5-turbo-16k")


def self_check_gpt(log: Log) -> float:
    """Measures how consistent is the output of a model under resampling the response."""
    if log.configuration is None or log.configuration.messages is None:
        return 0.0

    messages = [m.to_dict() for m in log.configuration.messages]

    n_sampled_outputs = 5
    sampled_outputs = []
    for _ in range(n_sampled_outputs):
        response = call_openai(
            messages=messages,
            model=log.configuration.model,
            temperature=1.0,
            max_tokens=log.configuration.model_params.max_length,
            top_p=log.configuration.model_params.top_p,
            frequency_penalty=log.configuration.model_params.frequency_penalty,
            presence_penalty=log.configuration.model_params.presence_penalty,
        )
        sampled_outputs.append(response)

    sentences = sent_tokenize(log.output)

    if len(sentences) == 0:
        return 0.0

    sentences_scores = []
    for sentence in sentences:
        scores = []
        for sampled_output in sampled_outputs:
            response = call_openai(
                messages=[
                    {
                        "role": "user",
                        "content": f"""Context: {sampled_output}
Sentence: {sentence}
Is the sentence supported by the context above?
Answer Yes or No:""",
                    }
                ],
                model="gpt-3.5-turbo",
                temperature=0.0,
            )
            scores.append(float("yes" in response.lower()))
        sentences_scores.append(sum(scores) / len(scores))

    return sum(sentences_scores) / len(sentences_scores)


def lm_vs_lm_factuality_factory(examiner_model: str = "gpt-3.5-turbo") -> Callable[[Log], float]:
    """Using an examining LLM, measures the factuality of a claim. Examining LLM asks follow-up questions to the other
@@ -171,5 +73,4 @@ def lm_vs_lm_factuality(log: Log) -> float:


lm_vs_lm_factuality_gpt4 = lm_vs_lm_factuality_factory("gpt-4")

lm_vs_lm_factuality_gpt3t = lm_vs_lm_factuality_factory("gpt-3.5-turbo-16k")
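A hedged usage sketch for the factory kept by this rename; per its docstring, the returned eval has an examiner LLM cross-question the original model about its claim.

from parea.evals.general import lm_vs_lm_factuality_factory

lm_vs_lm_eval = lm_vs_lm_factuality_factory(examiner_model="gpt-4")  # examiner defaults to "gpt-3.5-turbo"
# lm_vs_lm_eval(log) scores the factuality of log.output based on the examiner's verdict.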
51 changes: 51 additions & 0 deletions parea/evals/general/self_check.py
@@ -0,0 +1,51 @@
from parea.evals.utils import call_openai, sent_tokenize
from parea.schemas.log import Log


def self_check(log: Log) -> float:
    """Measures how consistent the model's output is when the response is resampled."""
    if log.configuration is None or log.configuration.messages is None:
        return 0.0

    messages = [m.to_dict() for m in log.configuration.messages]

    n_sampled_outputs = 5
    sampled_outputs = []
    for _ in range(n_sampled_outputs):
        response = call_openai(
            messages=messages,
            model=log.configuration.model,
            temperature=1.0,
            max_tokens=log.configuration.model_params.max_length,
            top_p=log.configuration.model_params.top_p,
            frequency_penalty=log.configuration.model_params.frequency_penalty,
            presence_penalty=log.configuration.model_params.presence_penalty,
        )
        sampled_outputs.append(response)

    sentences = sent_tokenize(log.output)

    if len(sentences) == 0:
        return 0.0

    sentences_scores = []
    for sentence in sentences:
        scores = []
        for sampled_output in sampled_outputs:
            response = call_openai(
                messages=[
                    {
                        "role": "user",
                        "content": f"""Context: {sampled_output}
Sentence: {sentence}
Is the sentence supported by the context above?
Answer Yes or No:""",
                    }
                ],
                model="gpt-3.5-turbo",
                temperature=0.0,
            )
            scores.append(float("yes" in response.lower()))
        sentences_scores.append(sum(scores) / len(scores))

    return sum(sentences_scores) / len(sentences_scores)
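A short hedged note on using `self_check`: it needs the full request configuration on the log, not just the output.

from parea.evals.general import self_check

# self_check resamples the original prompt five times at temperature 1.0 and returns the fraction of
# output sentences that the resampled generations support; it returns 0.0 when log.configuration (or
# its messages) is missing. `log` here is assumed to come from an instrumented Parea call.
score = self_check(log)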