diff --git a/README.md b/README.md index 38c9307d..6d9c4ba0 100644 --- a/README.md +++ b/README.md @@ -35,8 +35,8 @@ local CSV file if you don't have a Parea API key. Evaluation functions receive an argument `log` (of type [Log](parea/schemas/models.py)) and should return a float between 0 (bad) and 1 (good) inclusive. You don't need to start from scratch, there are pre-defined evaluation -functions for [general purpose](parea/evals/general.py), -[chat](parea/evals/chat.py), [RAG](parea/evals/rag.py), and [summarization](parea/evals/summary.py) apps :) +functions for [general purpose](parea/evals/general), +[chat](parea/evals/chat), [RAG](parea/evals/rag), and [summarization](parea/evals/summary) apps :) You can define evaluation functions locally or use the ones you have deployed to Parea's [Test Hub](https://app.parea.ai/test-hub). diff --git a/parea/evals/chat/__init__.py b/parea/evals/chat/__init__.py new file mode 100644 index 00000000..b1c7ceae --- /dev/null +++ b/parea/evals/chat/__init__.py @@ -0,0 +1 @@ +from .goal_success_ratio import goal_success_ratio_factory diff --git a/parea/evals/chat.py b/parea/evals/chat/goal_success_ratio.py similarity index 100% rename from parea/evals/chat.py rename to parea/evals/chat/goal_success_ratio.py diff --git a/parea/evals/general/__init__.py b/parea/evals/general/__init__.py new file mode 100644 index 00000000..366a8096 --- /dev/null +++ b/parea/evals/general/__init__.py @@ -0,0 +1,6 @@ +from .answer_matches_target_llm_grader import answer_matches_target_llm_grader_factory +from .answer_matches_target_recall import answer_matches_target_recall +from .answer_relevancy import answer_relevancy_factory +from .llm_grader import llm_grader_factory, llm_grader_gpt3t, llm_grader_gpt4 +from .lm_vs_lm import lm_vs_lm_factuality_factory, lm_vs_lm_factuality_gpt3t, lm_vs_lm_factuality_gpt4 +from .self_check import self_check diff --git a/parea/evals/general/answer_matches_target_llm_grader.py b/parea/evals/general/answer_matches_target_llm_grader.py new file mode 100644 index 00000000..6f0a8491 --- /dev/null +++ b/parea/evals/general/answer_matches_target_llm_grader.py @@ -0,0 +1,36 @@ +from typing import Callable, Optional + +from parea.evals.utils import call_openai +from parea.schemas.log import Log + + +def answer_matches_target_llm_grader_factory( + question_field: Optional[str] = "question", + model: Optional[str] = "gpt-4", +) -> Callable[[Log], float]: + """Quantifies how much the generated answer matches the ground truth / target.""" + + def answer_matches_target_llm_grader(log: Log) -> float: + question = log.inputs[question_field] + output = log.output + target = log.target + response = call_openai( + model=model, + messages=[ + {"role": "system", "content": "You are CompareGPT, a machine to verify the groundedness of predictions. Answer with " "only yes/no."}, + { + "role": "user", + "content": f"""You are given a question, the corresponding ground-truth answer and a prediction from a model. Compare the "Ground-truth answer" and the "Prediction" to determine whether the prediction correctly answers the question. All information in the ground-truth answer must be present in the prediction, including numbers and dates. You must answer "no" if there are any specific details in the ground-truth answer that are not mentioned in the prediction. There should be no contradicting statements in the prediction. The prediction may contain extra information. If the prediction states something as a possibility, treat it as a definitive answer. 
+ +Question: {question} +Ground-truth answer: {target} +Prediction: {output} + +CompareGPT response:""", + }, + ], + temperature=0.0, + ) + return float("yes" in response.lower()) + + return answer_matches_target_llm_grader diff --git a/parea/evals/general/answer_matches_target_recall.py b/parea/evals/general/answer_matches_target_recall.py new file mode 100644 index 00000000..86c22140 --- /dev/null +++ b/parea/evals/general/answer_matches_target_recall.py @@ -0,0 +1,27 @@ +from collections import Counter + +from parea.schemas.log import Log + + +def answer_matches_target_recall(log: Log) -> float: + """Prop. of tokens in target/reference answer which are also in model generation.""" + target = log.target + output = log.output + + provider = log.configuration.provider + model = log.configuration.model + + if provider == "openai": + import tiktoken + + encoding = tiktoken.encoding_for_model(model) + target_tokens = encoding.encode(target) + output_tokens = encoding.encode(output) + else: + raise NotImplementedError + + if len(target_tokens) == 0: + return 1.0 + common_tokens = Counter(target_tokens) & Counter(output_tokens) + num_common = sum(common_tokens.values()) + return num_common / len(target_tokens) diff --git a/parea/evals/general/answer_relevancy.py b/parea/evals/general/answer_relevancy.py new file mode 100644 index 00000000..6d0e0ffb --- /dev/null +++ b/parea/evals/general/answer_relevancy.py @@ -0,0 +1,44 @@ +from typing import Callable + +from parea.evals.utils import call_openai, embed +from parea.schemas.log import Log + + +def answer_relevancy_factory(question_field: str = "question", n_generations: int = 3) -> Callable[[Log], float]: + """Quantifies how much the generated answer relates to the query.""" + try: + import numpy as np + except ImportError: + raise ImportError("Please install numpy to use this metric.") + + def answer_relevancy(log: Log) -> float: + """Quantifies how much the generated answer relates to the query.""" + question = log.inputs[question_field] + output = log.output + + generated_questions = call_openai( + model="gpt-3.5-turbo-16k", + messages=[ + { + "role": "user", + "content": f"""\ +Generate question for the given answer. +Answer:\nThe PSLV-C56 mission is scheduled to be launched on Sunday, 30 July 2023 at 06:30 IST / 01:00 UTC. It will be launched from the Satish Dhawan Space Centre, Sriharikota, Andhra Pradesh, India +Question: When is the scheduled launch date and time for the PSLV-C56 mission, and where will it be launched from? 
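# Illustrative, self-contained sketch of the Counter-based recall computed by
# answer_matches_target_recall above. A plain whitespace split stands in for the
# provider tokenizer (tiktoken) purely for brevity; the helper name is hypothetical.
from collections import Counter


def _token_recall_sketch(target: str, output: str) -> float:
    target_tokens = target.split()
    output_tokens = output.split()
    if not target_tokens:
        return 1.0
    common = Counter(target_tokens) & Counter(output_tokens)
    return sum(common.values()) / len(target_tokens)


# _token_recall_sketch("blue whale", "the blue whale is large")  -> 1.0
# _token_recall_sketch("blue whale", "a large dolphin")          -> 0.0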
+ +Answer: {output} +Question:""", + } + ], + temperature=0.0, + n=n_generations, + ) + embedded_generated_questions = [embed(model="text-embedding-ada-002", input=q) for q in generated_questions] + embedded_question = embed(model="text-embedding-ada-002", input=question) + + question_vec = np.asarray(embedded_question).reshape(1, -1) + gen_question_vec = np.asarray(embedded_generated_questions) + norm = np.linalg.norm(gen_question_vec, axis=1) * np.linalg.norm(question_vec, axis=1) + return (np.dot(gen_question_vec, question_vec.T).reshape(-1) / norm).mean() + + return answer_relevancy diff --git a/parea/evals/general/llm_grader.py b/parea/evals/general/llm_grader.py new file mode 100644 index 00000000..aec5c3bd --- /dev/null +++ b/parea/evals/general/llm_grader.py @@ -0,0 +1,52 @@ +from typing import Callable + +import ast +import re + +from parea.evals.utils import call_openai +from parea.schemas.log import Log + +one_score_pattern = re.compile("\[\[(\d+\.?\d*)\]\]") +one_score_pattern_backup = re.compile("\[(\d+\.?\d*)\]") + + +def llm_grader_factory(model: str, question_field: str = "question") -> Callable[[Log], float]: + """Measures the generated response quality by using a LLM on a scale of 1 to 10.""" + + def llm_grader(log: Log) -> float: + question = log.inputs[question_field] + output = log.output + rating_response = call_openai( + model=model, + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": f"[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response " + f"provided by an AI assistant to the user question displayed below. Your evaluation should " + f"consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and " + f"level of detail of the response. Begin your evaluation by providing a short explanation. " + f"Be as objective as possible. 
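# Minimal numpy sketch of the similarity step in answer_relevancy above: the score
# is the mean cosine similarity between the embedded original question and the
# questions regenerated from the answer. Toy 3-d vectors stand in for
# text-embedding-ada-002 embeddings; the helper name is only for this sketch.
import numpy as np


def _mean_cosine_similarity(question_vec, generated_question_vecs) -> float:
    q = np.asarray(question_vec).reshape(1, -1)
    g = np.asarray(generated_question_vecs)
    norm = np.linalg.norm(g, axis=1) * np.linalg.norm(q, axis=1)
    return float((np.dot(g, q.T).reshape(-1) / norm).mean())


# _mean_cosine_similarity([1.0, 0.0, 0.0], [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]])  -> 0.5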
After providing your explanation, you must rate the response " + f'on a scale of 1 to 10 by strictly following this format: "[[rating]]", for example: ' + f'"Rating: [[5]]".\n\n[Question]\n{question}\n\n[The Start of Assistant\'s Answer]' + f"\n{output}\n[The End of Assistant's Answer]", + }, + ], + temperature=0.0, + ) + match = re.search(one_score_pattern, rating_response) + if not match: + match = re.search(one_score_pattern_backup, rating_response) + + if match: + rating = ast.literal_eval(match.groups()[0]) + else: + rating = 0 + + return rating / 10.0 + + return llm_grader + + +llm_grader_gpt4 = llm_grader_factory("gpt-4") +llm_grader_gpt3t = llm_grader_factory("gpt-3.5-turbo-16k") diff --git a/parea/evals/general.py b/parea/evals/general/lm_vs_lm.py similarity index 51% rename from parea/evals/general.py rename to parea/evals/general/lm_vs_lm.py index 2913c6b5..394f6dc7 100644 --- a/parea/evals/general.py +++ b/parea/evals/general/lm_vs_lm.py @@ -1,106 +1,8 @@ from typing import Callable -import ast -import re - -from parea.evals.utils import call_openai, sent_tokenize +from parea.evals.utils import call_openai from parea.schemas.log import Log -one_score_pattern = re.compile("\[\[(\d+\.?\d*)\]\]") -one_score_pattern_backup = re.compile("\[(\d+\.?\d*)\]") - - -def judge_llm_factory(model: str, question_field: str = "question") -> Callable[[Log], float]: - """Measures the generated response quality by using a LLM on a scale of 1 to 10.""" - - def _eval_judge_llm(log: Log) -> float: - question = log.inputs[question_field] - output = log.output - rating_response = call_openai( - model=model, - messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - { - "role": "user", - "content": f"[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response " - f"provided by an AI assistant to the user question displayed below. Your evaluation should " - f"consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and " - f"level of detail of the response. Begin your evaluation by providing a short explanation. " - f"Be as objective as possible. 
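# Standalone sketch of the score extraction performed by llm_grader above: pull the
# "[[x]]" rating (or the "[x]" fallback) out of the judge's reply and normalize it
# to [0, 1]. The helper name is hypothetical and not part of the library.
import ast
import re

_one_score = re.compile(r"\[\[(\d+\.?\d*)\]\]")
_one_score_backup = re.compile(r"\[(\d+\.?\d*)\]")


def _extract_normalized_rating(judge_reply: str) -> float:
    match = re.search(_one_score, judge_reply) or re.search(_one_score_backup, judge_reply)
    rating = ast.literal_eval(match.groups()[0]) if match else 0
    return rating / 10.0


# _extract_normalized_rating("Clear and accurate. Rating: [[7]]")  -> 0.7
# _extract_normalized_rating("no score given")                     -> 0.0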
After providing your explanation, you must rate the response " - f'on a scale of 1 to 10 by strictly following this format: "[[rating]]", for example: ' - f'"Rating: [[5]]".\n\n[Question]\n{question}\n\n[The Start of Assistant\'s Answer]' - f"\n{output}\n[The End of Assistant's Answer]", - }, - ], - temperature=0.0, - ) - match = re.search(one_score_pattern, rating_response) - if not match: - match = re.search(one_score_pattern_backup, rating_response) - - if match: - rating = ast.literal_eval(match.groups()[0]) - else: - rating = 0 - - return rating / 10.0 - - return _eval_judge_llm - - -judge_llm_gpt4 = judge_llm_factory("gpt-4") - -judge_llm_gpt3t = judge_llm_factory("gpt-3.5-turbo-16k") - - -def self_check_gpt(log: Log) -> float: - """Measures how consistent is the output of a model under resampling the response.""" - if log.configuration is None or log.configuration.messages is None: - return 0.0 - - messages = [m.to_dict() for m in log.configuration.messages] - - n_sampled_outputs = 5 - sampled_outputs = [] - for _ in range(n_sampled_outputs): - response = call_openai( - messages=messages, - model=log.configuration.model, - temperature=1.0, - max_tokens=log.configuration.model_params.max_length, - top_p=log.configuration.model_params.top_p, - frequency_penalty=log.configuration.model_params.frequency_penalty, - presence_penalty=log.configuration.model_params.presence_penalty, - ) - sampled_outputs.append(response) - - sentences = sent_tokenize(log.output) - - if len(sentences) == 0: - return 0.0 - - sentences_scores = [] - for sentence in sentences: - scores = [] - for sampled_output in sampled_outputs: - response = call_openai( - messages=[ - { - "role": "user", - "content": f"""Context: {sampled_output} -Sentence: {sentence} -Is the sentence supported by the context above? -Answer Yes or No:""", - } - ], - model="gpt-3.5-turbo", - temperature=0.0, - ) - scores.append(float("yes" in response.lower())) - sentences_scores.append(sum(scores) / len(scores)) - - return sum(sentences_scores) / len(sentences_scores) - def lm_vs_lm_factuality_factory(examiner_model: str = "gpt-3.5-turbo") -> Callable[[Log], float]: """Using an examining LLM, measures the factuality of a claim. 
Examining LLM asks follow-up questions to the other @@ -171,5 +73,4 @@ def lm_vs_lm_factuality(log: Log) -> float: lm_vs_lm_factuality_gpt4 = lm_vs_lm_factuality_factory("gpt-4") - lm_vs_lm_factuality_gpt3t = lm_vs_lm_factuality_factory("gpt-3.5-turbo-16k") diff --git a/parea/evals/general/self_check.py b/parea/evals/general/self_check.py new file mode 100644 index 00000000..492b7911 --- /dev/null +++ b/parea/evals/general/self_check.py @@ -0,0 +1,51 @@ +from parea.evals.utils import call_openai, sent_tokenize +from parea.schemas.log import Log + + +def self_check(log: Log) -> float: + """Measures how consistent is the output of a model under resampling the response.""" + if log.configuration is None or log.configuration.messages is None: + return 0.0 + + messages = [m.to_dict() for m in log.configuration.messages] + + n_sampled_outputs = 5 + sampled_outputs = [] + for _ in range(n_sampled_outputs): + response = call_openai( + messages=messages, + model=log.configuration.model, + temperature=1.0, + max_tokens=log.configuration.model_params.max_length, + top_p=log.configuration.model_params.top_p, + frequency_penalty=log.configuration.model_params.frequency_penalty, + presence_penalty=log.configuration.model_params.presence_penalty, + ) + sampled_outputs.append(response) + + sentences = sent_tokenize(log.output) + + if len(sentences) == 0: + return 0.0 + + sentences_scores = [] + for sentence in sentences: + scores = [] + for sampled_output in sampled_outputs: + response = call_openai( + messages=[ + { + "role": "user", + "content": f"""Context: {sampled_output} +Sentence: {sentence} +Is the sentence supported by the context above? +Answer Yes or No:""", + } + ], + model="gpt-3.5-turbo", + temperature=0.0, + ) + scores.append(float("yes" in response.lower())) + sentences_scores.append(sum(scores) / len(scores)) + + return sum(sentences_scores) / len(sentences_scores) diff --git a/parea/evals/rag.py b/parea/evals/rag.py deleted file mode 100644 index 04fc4d5c..00000000 --- a/parea/evals/rag.py +++ /dev/null @@ -1,416 +0,0 @@ -from typing import Callable, List, Optional - -import re -from collections import Counter - -from parea.evals.utils import call_openai, embed, safe_json_loads, sent_tokenize -from parea.schemas.log import Log - - -def precision_response_context_factory(context_field: Optional[str] = "context") -> Callable[[Log], float]: - """Prop. of tokens in model generation which are also present in the retrieved context.""" - - def precision_response_context(log: Log) -> float: - """Prop. 
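# Illustrative aggregation mirroring self_check above: every sentence of the original
# output receives the fraction of resampled generations that support it, and the final
# score is the mean over sentences. The yes/no support votes are hard-coded here in
# place of the gpt-3.5-turbo calls; the helper name is only for this sketch.
from typing import List


def _self_check_score(support_votes_per_sentence: List[List[bool]]) -> float:
    if not support_votes_per_sentence:
        return 0.0
    sentence_scores = [sum(votes) / len(votes) for votes in support_votes_per_sentence]
    return sum(sentence_scores) / len(sentence_scores)


# Two sentences, each checked against three resampled outputs:
# _self_check_score([[True, True, True], [True, False, False]])  -> (1.0 + 1/3) / 2 ≈ 0.67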
of tokens in model generation which are also present in the retrieved context.""" - context = log.inputs[context_field] - - provider = log.configuration.provider - model = log.configuration.model - - if provider == "openai": - import tiktoken - - encoding = tiktoken.encoding_for_model(model) - context_tokens = encoding.encode(context) - output_tokens = encoding.encode(log.output) - else: - raise NotImplementedError - - if len(context_tokens) == 0: - return 1.0 - elif len(output_tokens) == 0: - return 0.0 - - common_tokens = Counter(context_tokens) & Counter(output_tokens) - num_common = sum(common_tokens.values()) - return num_common / len(output_tokens) - - return precision_response_context - - -def llm_critique_faithfulness_factory( - question_field: Optional[str] = "question", - context_field: Optional[str] = "context", - model: Optional[str] = "gpt-4", -) -> Callable[[Log], float]: - """Quantifies how much the generated answer can be inferred from the retrieved context.""" - - def llm_critique_faithfulness(log: Log) -> float: - question = log.inputs[question_field] - evidence = log.inputs[context_field] - output = log.output - response = call_openai( - model=model, - messages=[ - {"role": "system", "content": "You are CompareGPT, a machine to verify the groundedness of predictions. Answer with " "only yes/no."}, - { - "role": "user", - "content": f"You are given a question, the corresponding evidence and a prediction from a model. Compare " - f'the "Prediction" and the "Evidence" to determine whether all the information of the ' - f"prediction in present in the evidence or can be inferred from the evidence. You must answer " - f'"no" if there are any specific details in the prediction that are not mentioned in the ' - f"evidence or cannot be inferred from the evidence.\n\n" - f"Question: {question}\n\nPrediction: {output}\n\nEvidence: {evidence}\n\nCompareGPT response:", - }, - ], - temperature=0.0, - ) - return float("yes" in response.lower()) - - return llm_critique_faithfulness - - -def recall_response(log: Log) -> float: - """Prop. of tokens in target/reference answer which are also in model generation.""" - target = log.target - output = log.output - - provider = log.configuration.provider - model = log.configuration.model - - if provider == "openai": - import tiktoken - - encoding = tiktoken.encoding_for_model(model) - target_tokens = encoding.encode(target) - output_tokens = encoding.encode(output) - else: - raise NotImplementedError - - if len(target_tokens) == 0: - return 1.0 - common_tokens = Counter(target_tokens) & Counter(output_tokens) - num_common = sum(common_tokens.values()) - return num_common / len(target_tokens) - - -def llm_critique_correctness_factory( - question_field: Optional[str] = "question", - model: Optional[str] = "gpt-4", -) -> Callable[[Log], float]: - """Quantifies how much the generated answer matches the ground truth / target.""" - - def llm_critique_correctness(log: Log) -> float: - question = log.inputs[question_field] - output = log.output - target = log.target - response = call_openai( - model=model, - messages=[ - {"role": "system", "content": "You are CompareGPT, a machine to verify the groundedness of predictions. Answer with " "only yes/no."}, - { - "role": "user", - "content": f"""You are given a question, the corresponding ground-truth answer and a prediction from a model. Compare the "Ground-truth answer" and the "Prediction" to determine whether the prediction correctly answers the question. 
All information in the ground-truth answer must be present in the prediction, including numbers and dates. You must answer "no" if there are any specific details in the ground-truth answer that are not mentioned in the prediction. There should be no contradicting statements in the prediction. The prediction may contain extra information. If the prediction states something as a possibility, treat it as a definitive answer. - -Question: {question} -Ground-truth answer: {target} -Prediction: {output} - -CompareGPT response:""", - }, - ], - temperature=0.0, - ) - return float("yes" in response.lower()) - - return llm_critique_correctness - - -def ragas_context_relevancy_factory(question_field: str = "question", context_fields: List[str] = ["context"]) -> Callable[[Log], float]: - """Quantifies how much the retrieved context relates to the query.""" - - def ragas_context_relevancy(log: Log) -> float: - """Quantifies how much the retrieved context relates to the query.""" - question = log.inputs[question_field] - context = "\n".join(log.inputs[context_field] for context_field in context_fields) - - extracted_sentences = call_openai( - model="gpt-3.5-turbo-16k", - messages=[ - { - "role": "user", - "content": f"""\ -Please extract relevant sentences from the provided context that is absolutely required answer the following question. If no relevant sentences are found, or if you believe the question cannot be answered from the given context, return the phrase "Insufficient Information". While extracting candidate sentences you're not allowed to make any changes to sentences from given context. - -question:{question} -context:\n{context} -candidate sentences:\n""", - } - ], - temperature=0.0, - ).strip() - if extracted_sentences.lower() == "insufficient information": - return 0.0 - else: - n_extracted_sentences = len(sent_tokenize(extracted_sentences)) - n_context_sentences = len(sent_tokenize(context)) - return n_extracted_sentences / n_context_sentences - - return ragas_context_relevancy - - -def ragas_answer_context_faithfulness_factory(question_field: str = "question", context_fields: List[str] = ["context"]) -> Callable[[Log], float]: - """Quantifies how much the generated answer can be inferred from the retrieved context.""" - - def ragas_answer_context_faithfulness(log: Log) -> float: - """Quantifies how much the generated answer can be inferred from the retrieved context.""" - question = log.inputs[question_field] - context = "\n".join(log.inputs[context_field] for context_field in context_fields) - output = log.output - - completion = call_openai( - model="gpt-3.5-turbo-16k", - messages=[ - { - "role": "user", - "content": f"""\ -Given a question and answer, create one or more statements from each sentence in the given answer. -question: Who was Albert Einstein and what is he best known for? -answer: He was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. He was best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics. -statements:\nAlbert Einstein was born in Germany.\nAlbert Einstein was best known for his theory of relativity. -question: Cadmium Chloride is slightly soluble in this chemical, it is also called what? -answer: alcohol -statements:\nCadmium Chloride is slightly soluble in alcohol. -question: Were Shahul and Jithin of the same nationality? -answer: They were from different countries. 
-statements:\nShahul and Jithin were from different countries. -question:{question} -answer: {output} -statements:\n""", - } - ], - temperature=0.0, - ) - statements = completion.strip().split("\n") - statements_formatted = [f"{i+1}. {s.strip()}" for i, s in enumerate(statements)] - - verdicts = ( - call_openai( - model="gpt-3.5-turbo-16k", - messages=[ - { - "role": "user", - "content": f"""\ -Prompt: Natural language inference -Consider the given context and following statements, then determine whether they are supported by the information present in the context.Provide a brief explanation for each statement before arriving at the verdict (Yes/No). Provide a final verdict for each statement in order at the end in the given format. Do not deviate from the specified format. - -Context:\nJohn is a student at XYZ University. He is pursuing a degree in Computer Science. He is enrolled in several courses this semester, including Data Structures, Algorithms, and Database Management. John is a diligent student and spends a significant amount of time studying and completing assignments. He often stays late in the library to work on his projects. -statements:\n1. John is majoring in Biology.\n2. John is taking a course on Artificial Intelligence.\n3. John is a dedicated student.\n4. John has a part-time job.\n5. John is interested in computer programming.\n -Answer: -1. John is majoring in Biology. -Explanation: John's major is explicitly mentioned as Computer Science. There is no information suggesting he is majoring in Biology. Verdict: No. -2. John is taking a course on Artificial Intelligence. -Explanation: The context mentions the courses John is currently enrolled in, and Artificial Intelligence is not mentioned. Therefore, it cannot be deduced that John is taking a course on AI. Verdict: No. -3. John is a dedicated student. -Explanation: The prompt states that he spends a significant amount of time studying and completing assignments. Additionally, it mentions that he often stays late in the library to work on his projects, which implies dedication. Verdict: Yes. -4. John has a part-time job. -Explanation: There is no information given in the context about John having a part-time job. Therefore, it cannot be deduced that John has a part-time job. Verdict: No. -5. John is interested in computer programming. -Explanation: The context states that John is pursuing a degree in Computer Science, which implies an interest in computer programming. Verdict: Yes. -Final verdict for each statement in order: No. No. Yes. No. Yes. 
-context:\n{context} -statements:\n{statements_formatted} -Answer: -""", - } - ], - temperature=0.0, - ) - .lower() - .strip() - ) - final_answer = "Final verdict for each statement in order:".lower() - if final_answer in verdicts: - verdicts = verdicts[verdicts.find(final_answer) + len(final_answer) :] - yes_count = sum(0 if "yes" in answer else 1 for answer in verdicts.strip().split(".") if answer != "") - return yes_count / len(statements_formatted) - else: - return max(0, output.count("verdict: no")) / len(statements_formatted) - - return ragas_answer_context_faithfulness - - -def ragas_answer_relevancy_factory(question_field: str = "question", n_generations: int = 3) -> Callable[[Log], float]: - """Quantifies how much the generated answer relates to the query.""" - try: - import numpy as np - except ImportError: - raise ImportError("Please install numpy to use this metric.") - - def ragas_answer_relevancy(log: Log) -> float: - """Quantifies how much the generated answer relates to the query.""" - question = log.inputs[question_field] - output = log.output - - generated_questions = call_openai( - model="gpt-3.5-turbo-16k", - messages=[ - { - "role": "user", - "content": f"""\ -Generate question for the given answer. -Answer:\nThe PSLV-C56 mission is scheduled to be launched on Sunday, 30 July 2023 at 06:30 IST / 01:00 UTC. It will be launched from the Satish Dhawan Space Centre, Sriharikota, Andhra Pradesh, India -Question: When is the scheduled launch date and time for the PSLV-C56 mission, and where will it be launched from? - -Answer: {output} -Question: -""", - } - ], - temperature=0.0, - n=n_generations, - ) - embedded_generated_questions = [embed(model="text-embedding-ada-002", input=q) for q in generated_questions] - embedded_question = embed(model="text-embedding-ada-002", input=question) - - question_vec = np.asarray(embedded_question).reshape(1, -1) - gen_question_vec = np.asarray(embedded_generated_questions) - norm = np.linalg.norm(gen_question_vec, axis=1) * np.linalg.norm(question_vec, axis=1) - return (np.dot(gen_question_vec, question_vec.T).reshape(-1) / norm).mean() - - return ragas_answer_relevancy - - -def ragas_context_ranking_factory(question_field: str = "question", context_fields: List[str] = ["context"], ranking_measurement="average_precision") -> Callable[[Log], float]: - """Quantifies if the retrieved context is ranked by their relevancy""" - try: - import numpy as np - except ImportError: - raise ImportError("Please install numpy to use this metric.") - - def ragas_context_ranking(log: Log) -> float: - """Quantifies if the retrieved context is ranked by their relevancy""" - question = log.inputs[question_field] - contexts = [log.inputs[context_field] for context_field in context_fields] - - verifications = [] - for context in contexts: - response = call_openai( - model="gpt-3.5-turbo-16k", - messages=[ - { - "role": "user", - "content": f"""\ -Verify if the information in the given context is useful in answering the question. - -question: What are the health benefits of green tea? -context: -This article explores the rich history of tea cultivation in China, tracing its roots back to the ancient dynasties. It discusses how different regions have developed their unique tea varieties and brewing techniques. The article also delves into the cultural significance of tea in Chinese society and how it has become a symbol of hospitality and relaxation. 
-verification: -{{"reason":"The context, while informative about the history and cultural significance of tea in China, does not provide specific information about the health benefits of green tea. Thus, it is not useful for answering the question about health benefits.", "verdict":"No"}} - -question: How does photosynthesis work in plants? -context: -Photosynthesis in plants is a complex process involving multiple steps. This paper details how chlorophyll within the chloroplasts absorbs sunlight, which then drives the chemical reaction converting carbon dioxide and water into glucose and oxygen. It explains the role of light and dark reactions and how ATP and NADPH are produced during these processes. -verification: -{{"reason":"This context is extremely relevant and useful for answering the question. It directly addresses the mechanisms of photosynthesis, explaining the key components and processes involved.", "verdict":"Yes"}} - -question:{question} -context: -{context} -verification:""", - } - ], - temperature=0.0, - ) - verifications.append(response) - - if ranking_measurement == "average_precision": - response = [safe_json_loads(item) for item in verifications] - response = [int("yes" in resp.get("verdict", " ").lower()) if resp.get("verdict") else np.nan for resp in response] - denominator = sum(response) + 1e-10 - numerator = sum([(sum(response[: i + 1]) / (i + 1)) * response[i] for i in range(len(response))]) - return numerator / denominator - else: - raise NotImplementedError - - return ragas_context_ranking - - -def ragas_percent_target_supported_by_context_factory(question_field: str = "question", context_fields: List[str] = ["context"]) -> Callable[[Log], float]: - """Quantifies how many sentences in the target/ground truth are supported by the retrieved context.""" - - def ragas_percent_target_supported_by_context(log: Log) -> float: - """Quantifies how many sentences in the target/ground truth are supported by the retrieved context.""" - question = log.inputs[question_field] - context = "\n".join(log.inputs[context_field] for context_field in context_fields) - target = log.target - - classification = call_openai( - model="gpt-3.5-turbo-16k", - messages=[ - { - "role": "user", - "content": f"""\ -Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Output json with reason. - - -question: What can you tell me about albert Albert Einstein? -context: Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist,widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century. His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been called "the world's most famous equation". He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect", a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science. In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. 
His intellectual achievements and originality have made Einstein synonymous with genius. -answer: Albert Einstein born in 14 March 1879 was German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. He received the 1921 Nobel Prize in Physics "for his services to theoretical physics. He published 4 papers in 1905. Einstein moved to Switzerland in 1895 -classification: -[ - {{ - "statement_1":"Albert Einstein, born on 14 March 1879, was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time.", - "reason": "The date of birth of Einstein is mentioned clearly in the context.", - "Attributed": "Yes" - }}, - {{ - "statement_2":"He received the 1921 Nobel Prize in Physics 'for his services to theoretical physics.", - "reason": "The exact sentence is present in the given context.", - "Attributed": "Yes" - }}, - {{ - "statement_3": "He published 4 papers in 1905.", - "reason": "There is no mention about papers he wrote in the given context.", - "Attributed": "No" - }}, - {{ - "statement_4":"Einstein moved to Switzerland in 1895.", - "reason": "There is no supporting evidence for this in the given context.", - "Attributed": "No" - }} -] - -question: who won 2020 icc world cup? -context: Who won the 2022 ICC Men's T20 World Cup? -The 2022 ICC Men's T20 World Cup, held from October 16 to November 13, 2022, in Australia, was the eighth edition of the tournament. Originally scheduled for 2020, it was postponed due to the COVID-19 pandemic. England emerged victorious, defeating Pakistan by five wickets in the final to clinch their second ICC Men's T20 World Cup title. -answer: England -classification: -[ - {{ - "statement_1":"England won the 2022 ICC Men's T20 World Cup.", - "reason": "From context it is clear that England defeated Pakistan to win the World Cup.", - "Attributed": "Yes" - }} -] - -question: {question} -context: {context} -answer: {target} -classification: -""", - } - ], - temperature=0.0, - ) - pattern = "\[\s*\{.*?\}(\s*,\s*\{.*?\})*\s*\]" - match = re.search(pattern, classification.replace("\n", "")) - if match: - response = eval(classification) - numerator = sum(item.get("Attributed").lower() == "yes" for item in response) - return numerator / len(response) - else: - return 0.0 - - return ragas_percent_target_supported_by_context diff --git a/parea/evals/rag/__init__.py b/parea/evals/rag/__init__.py new file mode 100644 index 00000000..fec9007b --- /dev/null +++ b/parea/evals/rag/__init__.py @@ -0,0 +1,7 @@ +from .answer_context_faithfulness_binary import answer_context_faithfulness_binary_factory +from .answer_context_faithfulness_precision import answer_context_faithfulness_precision_factory +from .answer_context_faithfulness_statement_level import answer_context_faithfulness_statement_level_factory +from .context_query_relevancy import context_query_relevancy_factory +from .context_ranking_listwise import context_ranking_listwise_factory +from .context_ranking_pointwise import context_ranking_pointwise_factory +from .percent_target_supported_by_context import percent_target_supported_by_context_factory diff --git a/parea/evals/rag/answer_context_faithfulness_binary.py b/parea/evals/rag/answer_context_faithfulness_binary.py new file mode 100644 index 00000000..80aa6d57 --- /dev/null +++ b/parea/evals/rag/answer_context_faithfulness_binary.py @@ -0,0 +1,36 @@ +from typing import Callable, Optional + +from parea.evals.utils import call_openai +from 
parea.schemas.log import Log + + +def answer_context_faithfulness_binary_factory( + question_field: Optional[str] = "question", + context_field: Optional[str] = "context", + model: Optional[str] = "gpt-4", +) -> Callable[[Log], float]: + """Quantifies how much the generated answer can be inferred from the retrieved context.""" + + def answer_context_faithfulness_binary(log: Log) -> float: + question = log.inputs[question_field] + evidence = log.inputs[context_field] + output = log.output + response = call_openai( + model=model, + messages=[ + {"role": "system", "content": "You are CompareGPT, a machine to verify the groundedness of predictions. Answer with " "only yes/no."}, + { + "role": "user", + "content": f"You are given a question, the corresponding evidence and a prediction from a model. Compare " + f'the "Prediction" and the "Evidence" to determine whether all the information of the ' + f"prediction in present in the evidence or can be inferred from the evidence. You must answer " + f'"no" if there are any specific details in the prediction that are not mentioned in the ' + f"evidence or cannot be inferred from the evidence.\n\n" + f"Question: {question}\n\nPrediction: {output}\n\nEvidence: {evidence}\n\nCompareGPT response:", + }, + ], + temperature=0.0, + ) + return float("yes" in response.lower()) + + return answer_context_faithfulness_binary diff --git a/parea/evals/rag/answer_context_faithfulness_precision.py b/parea/evals/rag/answer_context_faithfulness_precision.py new file mode 100644 index 00000000..4b421c9f --- /dev/null +++ b/parea/evals/rag/answer_context_faithfulness_precision.py @@ -0,0 +1,36 @@ +from typing import Callable, Optional + +from collections import Counter + +from parea.schemas.log import Log + + +def answer_context_faithfulness_precision_factory(context_field: Optional[str] = "context") -> Callable[[Log], float]: + """Prop. of tokens in model generation which are also present in the retrieved context.""" + + def answer_context_faithfulness_precision(log: Log) -> float: + """Prop. 
of tokens in model generation which are also present in the retrieved context.""" + context = log.inputs[context_field] + + provider = log.configuration.provider + model = log.configuration.model + + if provider == "openai": + import tiktoken + + encoding = tiktoken.encoding_for_model(model) + context_tokens = encoding.encode(context) + output_tokens = encoding.encode(log.output) + else: + raise NotImplementedError + + if len(context_tokens) == 0: + return 1.0 + elif len(output_tokens) == 0: + return 0.0 + + common_tokens = Counter(context_tokens) & Counter(output_tokens) + num_common = sum(common_tokens.values()) + return num_common / len(output_tokens) + + return answer_context_faithfulness_precision diff --git a/parea/evals/rag/answer_context_faithfulness_statement_level.py b/parea/evals/rag/answer_context_faithfulness_statement_level.py new file mode 100644 index 00000000..ac38986f --- /dev/null +++ b/parea/evals/rag/answer_context_faithfulness_statement_level.py @@ -0,0 +1,85 @@ +from typing import Callable, List + +from parea.evals.utils import call_openai +from parea.schemas.log import Log + + +def answer_context_faithfulness_statement_level_factory(question_field: str = "question", context_fields: List[str] = ["context"]) -> Callable[[Log], float]: + """Quantifies how much the generated answer can be inferred from the retrieved context.""" + + def answer_context_faithfulness_statement_level(log: Log) -> float: + """Quantifies how much the generated answer can be inferred from the retrieved context.""" + question = log.inputs[question_field] + context = "\n".join(log.inputs[context_field] for context_field in context_fields) + output = log.output + + completion = call_openai( + model="gpt-3.5-turbo-16k", + messages=[ + { + "role": "user", + "content": f"""\ +Given a question and answer, create one or more statements from each sentence in the given answer. +question: Who was Albert Einstein and what is he best known for? +answer: He was a German-born theortical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. He was best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics. +statements:\nAlbert Einstein was born in Germany.\nAlbert Einstein was best known for his theory of relativity. +question: Cadmium Chloride is slightly soluble in this chemical, it is also called what? +answer: alcohol +statements:\nCadmium Chloride is slightly soluble in alcohol. +question: Were Shahul and Jithin of thee same nationality? +answer: They were from different countries. +statements:\nShahul and Jithin were from different countries. +question:{question} +answer: {output} +statements:\n""", + } + ], + temperature=0.0, + ) + statements = completion.strip().split("\n") + statements_formatted = [f"{i+1}. {s.strip()}" for i, s in enumerate(statements)] + + verdicts = ( + call_openai( + model="gpt-3.5-turbo-16k", + messages=[ + { + "role": "user", + "content": f"""\ +Prompt: Natural language inference +Consider the given context and following statements, then determine whether they are supported by the information present in the context.Provide a brief explanation for each statement before arriving at the verdict (Yes/No). Provide a final verdict for each statement in order at the end in the given format. Do not deviate from the specified format. + +Context:\nJohn is a student at XYZ University. He is pursuing a degree in Computer Science. 
He is enrolled in several courses this semester, including Data Structures, Algorithms, and Database Management. John is a diligent student and spends a significant amount of time studying and completing assignments. He often stays late in the library to work on his projects. +statements:\n1. John is majoring in Biology.\n2. John is taking a course on Artificial Intelligence.\n3. John is a dedicated student.\n4. John has a part-time job.\n5. John is interested in computer programming.\n +Answer: +1. John is majoring in Biology. +Explanation: John's major is explicitly mentioned as Computer Science. There is no information suggesting he is majoring in Biology. Verdict: No. +2. John is taking a course on Artificial Intelligence. +Explanation: The context mentions the courses John is currently enrolled in, and Artificial Intelligence is not mentioned. Therefore, it cannot be deduced that John is taking a course on AI. Verdict: No. +3. John is a dedicated student. +Explanation: The prompt states that he spends a significant amount of time studying and completing assignments. Additionally, it mentions that he often stays late in the library to work on his projects, which implies dedication. Verdict: Yes. +4. John has a part-time job. +Explanation: There is no information given in the context about John having a part-time job. Therefore, it cannot be deduced that John has a part-time job. Verdict: No. +5. John is interested in computer programming. +Explanation: The context states that John is pursuing a degree in Computer Science, which implies an interest in computer programming. Verdict: Yes. +Final verdict for each statement in order: No. No. Yes. No. Yes. +context:\n{context} +statements:\n{statements_formatted} +Answer: +""", + } + ], + temperature=0.0, + ) + .lower() + .strip() + ) + final_answer = "Final verdict for each statement in order:".lower() + if final_answer in verdicts: + verdicts = verdicts[verdicts.find(final_answer) + len(final_answer) :] + yes_count = sum(0 if "yes" in answer else 1 for answer in verdicts.strip().split(".") if answer != "") + return yes_count / len(statements_formatted) + else: + return max(0, output.count("verdict: no")) / len(statements_formatted) + + return answer_context_faithfulness_statement_level diff --git a/parea/evals/rag/context_query_relevancy.py b/parea/evals/rag/context_query_relevancy.py new file mode 100644 index 00000000..ac07aaae --- /dev/null +++ b/parea/evals/rag/context_query_relevancy.py @@ -0,0 +1,37 @@ +from typing import Callable, List + +from parea.evals.utils import call_openai, sent_tokenize +from parea.schemas.log import Log + + +def context_query_relevancy_factory(question_field: str = "question", context_fields: List[str] = ["context"]) -> Callable[[Log], float]: + """Quantifies how much the retrieved context relates to the query.""" + + def context_query_relevancy(log: Log) -> float: + """Quantifies how much the retrieved context relates to the query.""" + question = log.inputs[question_field] + context = "\n".join(log.inputs[context_field] for context_field in context_fields) + + extracted_sentences = call_openai( + model="gpt-3.5-turbo-16k", + messages=[ + { + "role": "user", + "content": f"""\ +Please extract relevant sentences from the provided context that is absolutely required answer the following question. If no relevant sentences are found, or if you believe the question cannot be answered from the given context, return the phrase "Insufficient Information". 
While extracting candidate sentences you're not allowed to make any changes to sentences from given context. + +question:{question} +context:\n{context} +candidate sentences:\n""", + } + ], + temperature=0.0, + ).strip() + if extracted_sentences.lower() == "insufficient information": + return 0.0 + else: + n_extracted_sentences = len(sent_tokenize(extracted_sentences)) + n_context_sentences = len(sent_tokenize(context)) + return n_extracted_sentences / n_context_sentences + + return context_query_relevancy diff --git a/parea/evals/rag/context_ranking_listwise.py b/parea/evals/rag/context_ranking_listwise.py new file mode 100644 index 00000000..9290e989 --- /dev/null +++ b/parea/evals/rag/context_ranking_listwise.py @@ -0,0 +1,103 @@ +from typing import Callable, List + +from parea.evals.utils import call_openai, ndcg +from parea.schemas.log import Log + + +def context_ranking_listwise_factory( + question_field: str = "question", + context_fields: List[str] = ["context"], + ranking_measurement="ndcg", + n_contexts_to_rank=10, +) -> Callable[[Log], float]: + """Quantifies if the retrieved context is ranked by their relevancy by re-ranking the contexts. + + Paper: https://arxiv.org/abs/2305.02156 + + Args: + question_field (str): The name of the field in the log that contains the question. Defaults to "question". + context_fields (List[str]): The name of the fields in the log that contain the contexts. Defaults to ["context"]. + ranking_measurement (str): The measurement to use for ranking. Defaults to "ndcg". + n_contexts_to_rank (int): The number of contexts to rank listwise. Defaults to 10. + + Returns: + Callable[[Log], float]: A function that takes a log as input and returns a score between 0 and 1 indicating + how well the retrieved context is ranked by their relevancy. + + Raises: + ValueError: If n_contexts_to_rank is less than 1. + """ + if n_contexts_to_rank < 1: + raise ValueError("n_contexts_to_rank must be at least 1.") + + def listwise_reranking(query: str, contexts: List[str]) -> List[int]: + """Uses a LLM to listwise rerank the contexts. Returns the indices of the contexts in the order of their + relevance (most relevant to least relevant).""" + if len(contexts) == 0 or len(contexts) == 1: + return list(range(len(contexts))) + + prompt = "" + for i in range(len(contexts)): + prompt += f"Passage{i + 1} = {contexts[i]}\n" + + prompt += f"""Query = {query} + Passages = [Passage1, ..., Passage{len(contexts)}] + Sort the Passages by their relevance to the Query. 
+ Sorted Passages = [""" + + sorted_list = call_openai( + messages=[ + { + "role": "user", + "content": prompt, + } + ], + model="gpt-3.5-turbo-16k", + temperature=0.0, + ) + + s = sorted_list.strip("[] ").replace(" ", "") + number_strings = s.split(",") + return [int(num) for num in number_strings if num.isdigit()] + + def progressive_reranking(query: str, contexts: List[str]) -> List[int]: + """Returns the indices of the contexts in the order of their relevance (most relevant to least relevant).""" + if len(contexts) <= n_contexts_to_rank: + return listwise_reranking(query, contexts) + + window_size = n_contexts_to_rank + window_step = n_contexts_to_rank // 2 + offset = len(contexts) - window_size + + indices = list(range(len(contexts))) + + while offset > 0: + window_contexts = contexts[offset : offset + window_size] + window_indices = indices[offset : offset + window_size] + reranked_indices = listwise_reranking(query, window_contexts) + contexts[offset : offset + window_size] = [window_contexts[i] for i in reranked_indices] + indices[offset : offset + window_size] = [window_indices[i] for i in reranked_indices] + + offset -= window_step + + window_contexts = contexts[:window_size] + window_indices = indices[:window_size] + reranked_indices = listwise_reranking(query, window_contexts) + contexts[:window_size] = [window_contexts[i] for i in reranked_indices] + indices[:window_size] = [window_indices[i] for i in reranked_indices] + + return indices + + def context_ranking(log: Log) -> float: + """Quantifies if the retrieved context is ranked by their relevancy by re-ranking the contexts.""" + question = log.inputs[question_field] + contexts = [log.inputs[context_field] for context_field in context_fields] + + reranked_indices = progressive_reranking(question, contexts) + + if ranking_measurement == "ndcg": + return ndcg(reranked_indices, list(range(len(contexts)))) + else: + raise NotImplementedError + + return context_ranking diff --git a/parea/evals/rag/context_ranking_pointwise.py b/parea/evals/rag/context_ranking_pointwise.py new file mode 100644 index 00000000..62a6c84c --- /dev/null +++ b/parea/evals/rag/context_ranking_pointwise.py @@ -0,0 +1,60 @@ +from typing import Callable, List + +from parea.evals.utils import call_openai, safe_json_loads +from parea.schemas.log import Log + + +def context_ranking_pointwise_factory(question_field: str = "question", context_fields: List[str] = ["context"], ranking_measurement="average_precision") -> Callable[[Log], float]: + """Quantifies if the retrieved context is ranked by their relevancy""" + try: + import numpy as np + except ImportError: + raise ImportError("Please install numpy to use this metric.") + + def context_ranking_pointwise(log: Log) -> float: + """Quantifies if the retrieved context is ranked by their relevancy""" + question = log.inputs[question_field] + contexts = [log.inputs[context_field] for context_field in context_fields] + + verifications = [] + for context in contexts: + response = call_openai( + model="gpt-3.5-turbo-16k", + messages=[ + { + "role": "user", + "content": f"""\ +Verify if the information in the given context is useful in answering the question. + +question: What are the health benefits of green tea? +context: +This article explores the rich history of tea cultivation in China, tracing its roots back to the ancient dynasties. It discusses how different regions have developed their unique tea varieties and brewing techniques. 
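# Illustrative helper mirroring how listwise_reranking above parses the model's
# "Sorted Passages = [...]" completion into a list of passage numbers. The function
# name is hypothetical and only for this sketch.
from typing import List


def _parse_sorted_passages(completion: str) -> List[int]:
    s = completion.strip("[] ").replace(" ", "")
    return [int(num) for num in s.split(",") if num.isdigit()]


# _parse_sorted_passages("2, 1, 3]")   -> [2, 1, 3]
# _parse_sorted_passages("[2, 1, 3]")  -> [2, 1, 3]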
The article also delves into the cultural significance of tea in Chinese society and how it has become a symbol of hospitality and relaxation. +verification: +{{"reason":"The context, while informative about the history and cultural significance of tea in China, does not provide specific information about the health benefits of green tea. Thus, it is not useful for answering the question about health benefits.", "verdict":"No"}} + +question: How does photosynthesis work in plants? +context: +Photosynthesis in plants is a complex process involving multiple steps. This paper details how chlorophyll within the chloroplasts absorbs sunlight, which then drives the chemical reaction converting carbon dioxide and water into glucose and oxygen. It explains the role of light and dark reactions and how ATP and NADPH are produced during these processes. +verification: +{{"reason":"This context is extremely relevant and useful for answering the question. It directly addresses the mechanisms of photosynthesis, explaining the key components and processes involved.", "verdict":"Yes"}} + +question:{question} +context: +{context} +verification:""", + } + ], + temperature=0.0, + ) + verifications.append(response) + + if ranking_measurement == "average_precision": + response = [safe_json_loads(item) for item in verifications] + response = [int("yes" in resp.get("verdict", " ").lower()) if resp.get("verdict") else np.nan for resp in response] + denominator = sum(response) + 1e-10 + numerator = sum([(sum(response[: i + 1]) / (i + 1)) * response[i] for i in range(len(response))]) + return numerator / denominator + else: + raise NotImplementedError + + return context_ranking_pointwise diff --git a/parea/evals/rag/percent_target_supported_by_context.py b/parea/evals/rag/percent_target_supported_by_context.py new file mode 100644 index 00000000..8ac46bb0 --- /dev/null +++ b/parea/evals/rag/percent_target_supported_by_context.py @@ -0,0 +1,85 @@ +from typing import Callable, List + +import re + +from parea.evals.utils import call_openai +from parea.schemas.log import Log + + +def percent_target_supported_by_context_factory(question_field: str = "question", context_fields: List[str] = ["context"]) -> Callable[[Log], float]: + """Quantifies how many sentences in the target/ground truth are supported by the retrieved context.""" + + def percent_target_supported_by_context(log: Log) -> float: + """Quantifies how many sentences in the target/ground truth are supported by the retrieved context.""" + question = log.inputs[question_field] + context = "\n".join(log.inputs[context_field] for context_field in context_fields) + target = log.target + + classification = call_openai( + model="gpt-3.5-turbo-16k", + messages=[ + { + "role": "user", + "content": f"""\ +Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Output json with reason. + + +question: What can you tell me about albert Albert Einstein? +context: Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist,widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century. 
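# Self-contained sketch of the average-precision aggregation in context_ranking_pointwise
# above: verdicts[i] is 1 if the LLM judged the i-th retrieved context useful for the
# query, 0 otherwise. Relevant contexts ranked earlier yield a higher score. The helper
# name is only for this sketch.
from typing import List


def _average_precision(verdicts: List[int]) -> float:
    denominator = sum(verdicts) + 1e-10
    numerator = sum((sum(verdicts[: i + 1]) / (i + 1)) * verdicts[i] for i in range(len(verdicts)))
    return numerator / denominator


# _average_precision([1, 1, 0])  -> ~1.0
# _average_precision([0, 1, 1])  -> ~0.58  ((1/2 + 2/3) / 2)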
His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been called "the world's most famous equation". He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect", a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science. In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. His intellectual achievements and originality have made Einstein synonymous with genius. +answer: Albert Einstein born in 14 March 1879 was German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. He received the 1921 Nobel Prize in Physics "for his services to theoretical physics. He published 4 papers in 1905. Einstein moved to Switzerland in 1895 +classification: +[ + {{ + "statement_1":"Albert Einstein, born on 14 March 1879, was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time.", + "reason": "The date of birth of Einstein is mentioned clearly in the context.", + "Attributed": "Yes" + }}, + {{ + "statement_2":"He received the 1921 Nobel Prize in Physics 'for his services to theoretical physics.", + "reason": "The exact sentence is present in the given context.", + "Attributed": "Yes" + }}, + {{ + "statement_3": "He published 4 papers in 1905.", + "reason": "There is no mention about papers he wrote in the given context.", + "Attributed": "No" + }}, + {{ + "statement_4":"Einstein moved to Switzerland in 1895.", + "reason": "There is no supporting evidence for this in the given context.", + "Attributed": "No" + }} +] + +question: who won 2020 icc world cup? +context: Who won the 2022 ICC Men's T20 World Cup? +The 2022 ICC Men's T20 World Cup, held from October 16 to November 13, 2022, in Australia, was the eighth edition of the tournament. Originally scheduled for 2020, it was postponed due to the COVID-19 pandemic. England emerged victorious, defeating Pakistan by five wickets in the final to clinch their second ICC Men's T20 World Cup title. 
+answer: England +classification: +[ + {{ + "statement_1":"England won the 2022 ICC Men's T20 World Cup.", + "reason": "From context it is clear that England defeated Pakistan to win the World Cup.", + "Attributed": "Yes" + }} +] + +question: {question} +context: {context} +answer: {target} +classification: +""", + } + ], + temperature=0.0, + ) + pattern = "\[\s*\{.*?\}(\s*,\s*\{.*?\})*\s*\]" + match = re.search(pattern, classification.replace("\n", "")) + if match: + response = eval(classification) + numerator = sum(item.get("Attributed").lower() == "yes" for item in response) + return numerator / len(response) + else: + return 0.0 + + return percent_target_supported_by_context diff --git a/parea/evals/summary.py b/parea/evals/summary.py deleted file mode 100644 index ece5523f..00000000 --- a/parea/evals/summary.py +++ /dev/null @@ -1,103 +0,0 @@ -from typing import Callable, Optional - -import re - -from parea.evals.utils import call_openai -from parea.schemas.models import Log - - -def factual_inconsistency_binary_factory( - article_field: Optional[str] = "article", - model: Optional[str] = "gpt-4", -) -> Callable[[Log], float]: - def factual_inconsistency_binary(log: Log) -> float: - article = log.inputs[article_field] - output = log.output - prompt = f"""Decide if the following summary is consistent with the corresponding article. Note that consistency means all information in the summary is supported by the article. - Article: {article} - Summary: {output} - Explain your reasoning step by step then answer (yes or no) the question:""" - response = call_openai( - model=model, - messages=[ - {"role": "user", "content": prompt}, - ], - temperature=0.0, - ) - return float("yes" in response.lower()) - - return factual_inconsistency_binary - - -def factual_inconsistency_scale_factory( - article_field: Optional[str] = "article", - model: Optional[str] = "gpt-4", -) -> Callable[[Log], float]: - def factual_inconsistency_scale(log: Log) -> float: - article = log.inputs[article_field] - output = log.output - prompt = f"""Score the following summary given the corresponding article with respect to consistency from 1 to 10. Note that consistency measures how much information included in the summary is present in the source article. 10 points indicate the summary contains only statements that are entailed by the source document. - Article: {article} - Summary: {output} - Marks: """ - response = call_openai( - model=model, - messages=[ - {"role": "user", "content": prompt}, - ], - temperature=0.0, - ) - - pattern = re.compile(r"\d+") - match = pattern.search(response) - if match: - score = match.group() - else: - score = 0 - - return float(score) / 10.0 - - return factual_inconsistency_scale - - -def likert_scale_eval_factory( - article_field: Optional[str] = "article", - model: Optional[str] = "gpt-4", -) -> Callable[[Log], float]: - def likert_scale_eval(log: Log) -> float: - article = log.inputs[article_field] - output = log.output - prompt = f"""Evaluate the quality of summaries written for a news article. Rate each summary on four dimensions: relevance, consistency, fluency, and coherence. You should rate on a scale from 1 (worst) to 5 (best). - -Definitions are as follows: -Relevance: The rating measures how well the summary captures the key points of the article. Consider whether all and only the important aspects are contained in the summary. -Consistency: The rating measures whether the facts in the summary are consistent with the facts in the original article. 
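# Sketch of the final aggregation in percent_target_supported_by_context above: given the
# parsed classification list, the score is the fraction of target statements the LLM marked
# as attributable to the retrieved context. The helper name is hypothetical.
from typing import Dict, List


def _fraction_attributed(classification: List[Dict[str, str]]) -> float:
    if not classification:
        return 0.0
    attributed = sum(item.get("Attributed", "").lower() == "yes" for item in classification)
    return attributed / len(classification)


# _fraction_attributed([
#     {"statement_1": "...", "reason": "...", "Attributed": "Yes"},
#     {"statement_2": "...", "reason": "...", "Attributed": "No"},
# ])  -> 0.5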
Consider whether the summary does reproduce all facts accurately and does not make up untrue information. -Fluency: This rating measures the quality of individual sentences, whether they are well-written and grammatically correct. Consider the quality of individual sentences. -Coherence: The rating measures the quality of all sentences collectively, to fit together and sound natural. Consider the quality of the summary as a whole. - -The article and the summary are given below: -Article: {article} -Summary: {output}""" - response = call_openai( - model=model, - messages=[ - {"role": "user", "content": prompt}, - ], - temperature=0.0, - ) - - # extract the scores - pattern = re.compile(r"\d+") - matches = pattern.findall(response) - if matches: - scores = matches - else: - scores = [0, 0, 0, 0] - - # normalize the scores - scores = [float(score) / 5.0 for score in scores] - - # average the scores - return sum(scores) / len(scores) - - return likert_scale_eval diff --git a/parea/evals/summary/__init__.py b/parea/evals/summary/__init__.py new file mode 100644 index 00000000..412a0be4 --- /dev/null +++ b/parea/evals/summary/__init__.py @@ -0,0 +1,3 @@ +from .factual_inconsistency_binary import factual_inconsistency_binary_factory +from .factual_inconsistency_scale import factual_inconsistency_scale_factory +from .likert_scale import likert_scale_factory diff --git a/parea/evals/summary/factual_inconsistency_binary.py b/parea/evals/summary/factual_inconsistency_binary.py new file mode 100644 index 00000000..c23d9849 --- /dev/null +++ b/parea/evals/summary/factual_inconsistency_binary.py @@ -0,0 +1,27 @@ +from typing import Callable, Optional + +from parea.evals.utils import call_openai +from parea.schemas.log import Log + + +def factual_inconsistency_binary_factory( + article_field: Optional[str] = "article", + model: Optional[str] = "gpt-4", +) -> Callable[[Log], float]: + def factual_inconsistency_binary(log: Log) -> float: + article = log.inputs[article_field] + output = log.output + prompt = f"""Decide if the following summary is consistent with the corresponding article. Note that consistency means all information in the summary is supported by the article. + Article: {article} + Summary: {output} + Explain your reasoning step by step then answer (yes or no) the question:""" + response = call_openai( + model=model, + messages=[ + {"role": "user", "content": prompt}, + ], + temperature=0.0, + ) + return float("yes" in response.lower()) + + return factual_inconsistency_binary diff --git a/parea/evals/summary/factual_inconsistency_scale.py b/parea/evals/summary/factual_inconsistency_scale.py new file mode 100644 index 00000000..a8a73e43 --- /dev/null +++ b/parea/evals/summary/factual_inconsistency_scale.py @@ -0,0 +1,37 @@ +from typing import Callable, Optional + +import re + +from parea.evals.utils import call_openai +from parea.schemas.log import Log + + +def factual_inconsistency_scale_factory( + article_field: Optional[str] = "article", + model: Optional[str] = "gpt-4", +) -> Callable[[Log], float]: + def factual_inconsistency_scale(log: Log) -> float: + article = log.inputs[article_field] + output = log.output + prompt = f"""Score the following summary given the corresponding article with respect to consistency from 1 to 10. Note that consistency measures how much information included in the summary is present in the source article. 10 points indicate the summary contains only statements that are entailed by the source document. 
+ Article: {article} + Summary: {output} + Marks: """ + response = call_openai( + model=model, + messages=[ + {"role": "user", "content": prompt}, + ], + temperature=0.0, + ) + + pattern = re.compile(r"\d+") + match = pattern.search(response) + if match: + score = match.group() + else: + score = 0 + + return float(score) / 10.0 + + return factual_inconsistency_scale diff --git a/parea/evals/summary/likert_scale.py b/parea/evals/summary/likert_scale.py new file mode 100644 index 00000000..e705f3f9 --- /dev/null +++ b/parea/evals/summary/likert_scale.py @@ -0,0 +1,49 @@ +from typing import Callable, Optional + +import re + +from parea.evals.utils import call_openai +from parea.schemas.log import Log + + +def likert_scale_factory( + article_field: Optional[str] = "article", + model: Optional[str] = "gpt-4", +) -> Callable[[Log], float]: + def likert_scale(log: Log) -> float: + article = log.inputs[article_field] + output = log.output + prompt = f"""Evaluate the quality of summaries written for a news article. Rate each summary on four dimensions: relevance, consistency, fluency, and coherence. You should rate on a scale from 1 (worst) to 5 (best). + +Definitions are as follows: +Relevance: The rating measures how well the summary captures the key points of the article. Consider whether all and only the important aspects are contained in the summary. +Consistency: The rating measures whether the facts in the summary are consistent with the facts in the original article. Consider whether the summary does reproduce all facts accurately and does not make up untrue information. +Fluency: This rating measures the quality of individual sentences, whether they are well-written and grammatically correct. Consider the quality of individual sentences. +Coherence: The rating measures the quality of all sentences collectively, to fit together and sound natural. Consider the quality of the summary as a whole. 
+ +The article and the summary are given below: +Article: {article} +Summary: {output}""" + response = call_openai( + model=model, + messages=[ + {"role": "user", "content": prompt}, + ], + temperature=0.0, + ) + + # extract the scores + pattern = re.compile(r"\d+") + matches = pattern.findall(response) + if matches: + scores = matches + else: + scores = [0, 0, 0, 0] + + # normalize the scores + scores = [float(score) / 5.0 for score in scores] + + # average the scores + return sum(scores) / len(scores) + + return likert_scale diff --git a/parea/evals/utils.py b/parea/evals/utils.py index af9401b3..0c5a98d6 100644 --- a/parea/evals/utils.py +++ b/parea/evals/utils.py @@ -64,3 +64,25 @@ def embed(model, input) -> List[float]: return openai.Embedding.create(model=model, input=input, encoding_format="float").data[0]["embedding"] else: return openai.embeddings.create(model=model, input=input, encoding_format="float").data[0].embedding + + +def dcg(y_true, ranking): + """Discounted cumulative gain (DCG) at rank k.""" + import numpy as np + + y_true = np.asarray(y_true) + ranking = np.asarray(ranking) + rel = y_true[ranking] + gains = 2**rel - 1 + discounts = np.log2(np.arange(len(ranking)) + 2) + return np.sum(gains / discounts) + + +def ndcg(y_true, ranking): + """Normalized discounted cumulative gain (NDCG) at rank k""" + import numpy as np + + k = len(ranking) + best_ranking = np.argsort(y_true)[::-1] + best = dcg(y_true, best_ranking[:k]) + return dcg(y_true, ranking) / best diff --git a/pyproject.toml b/pyproject.toml index bfbcad5c..9400acb0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "parea-ai" packages = [{ include = "parea" }] -version = "0.2.22" +version = "0.2.23" description = "Parea python sdk" readme = "README.md" authors = ["joel-parea-ai "]
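
Reviewer note on the classification parsing at the top of this diff (the tail of percent_target_supported_by_context): the added code matches the JSON-ish array with a non-raw pattern string and then calls eval() over the full model completion, and item.get("Attributed") can be None. The sketch below is a suggestion only, not part of the change: it uses a raw-string pattern, parses just the matched span with json.loads, and guards the missing-key and empty-list cases. The parse_classification helper name is hypothetical.

```python
# Reviewer sketch only -- not part of this diff. Hardens the classification parsing
# shown above: raw-string regex, json.loads on the matched span instead of eval()
# over the whole completion, and guards for a missing "Attributed" key or empty list.
import json
import re


def parse_classification(classification: str) -> float:
    pattern = r"\[\s*\{.*?\}(\s*,\s*\{.*?\})*\s*\]"
    match = re.search(pattern, classification.replace("\n", ""))
    if not match:
        return 0.0
    try:
        items = json.loads(match.group(0))
    except json.JSONDecodeError:
        return 0.0
    if not items:
        return 0.0
    supported = sum(str(item.get("Attributed", "")).lower() == "yes" for item in items)
    return supported / len(items)
```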
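For downstream users, the practical effect of the summary.py deletion is a package that re-exports the same factories from parea.evals.summary, plus one rename: likert_scale_eval_factory in the old module becomes likert_scale_factory in the new package. A minimal sketch of the updated imports; the arguments shown are just the defaults from this diff.

```python
# Minimal sketch of the import paths after the parea/evals/summary.py -> summary/ split.
# Note the rename: likert_scale_eval_factory (old module) -> likert_scale_factory (new package).
from parea.evals.summary import (
    factual_inconsistency_binary_factory,
    factual_inconsistency_scale_factory,
    likert_scale_factory,
)

# Each factory returns a Callable[[Log], float]; the arguments below are the diff's defaults.
consistency_eval = factual_inconsistency_binary_factory(article_field="article", model="gpt-4")
consistency_scale_eval = factual_inconsistency_scale_factory(article_field="article", model="gpt-4")
likert_eval = likert_scale_factory(article_field="article", model="gpt-4")
```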
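Both scale-based summary evals normalize LLM-reported integers into [0, 1]: factual_inconsistency_scale takes the first integer in the response and divides it by 10, while likert_scale divides every integer it finds by 5 and averages them. A small worked check with made-up model responses:

```python
import re

# Made-up model responses, only to illustrate the normalization in the two scale evals.
scale_response = "Marks: 7"
likert_response = "Relevance: 4\nConsistency: 5\nFluency: 3\nCoherence: 4"

first = re.search(r"\d+", scale_response).group()
print(float(first) / 10.0)  # factual_inconsistency_scale: 7 / 10 = 0.7

scores = [float(s) / 5.0 for s in re.findall(r"\d+", likert_response)]
print(sum(scores) / len(scores))  # likert_scale: (0.8 + 1.0 + 0.6 + 0.8) / 4 = 0.8
```

Note that likert_scale's findall over \d+ will also pick up any stray digits the model echoes back from the article or summary, so the average can silently include more than four numbers.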
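The new dcg/ndcg helpers in parea/evals/utils.py expect graded relevance labels plus a ranking expressed as integer indices into those labels. A usage sketch with made-up data:

```python
# Usage sketch for the dcg/ndcg helpers added to parea/evals/utils.py.
# y_true holds graded relevance labels; ranking holds integer indices into y_true,
# ordered from the first-retrieved document to the last. The data below is made up.
from parea.evals.utils import dcg, ndcg

y_true = [3, 2, 3, 0, 1]   # relevance label per candidate document
ranking = [0, 2, 1, 4, 3]  # retrieval order, as indices into y_true

print(dcg(y_true, ranking))   # discounted cumulative gain of this ranking
print(ndcg(y_true, ranking))  # 1.0 here, since this order matches the ideal one
```

One edge case worth flagging: when every label in y_true is zero, the ideal DCG is zero and ndcg divides by it, which under numpy yields nan rather than raising, so callers may want to special-case an all-irrelevant ranking.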