diff --git a/parea/evals/general.py b/parea/evals/general.py
index 8867e722..279e63ab 100644
--- a/parea/evals/general.py
+++ b/parea/evals/general.py
@@ -10,11 +10,11 @@
 one_score_pattern_backup = re.compile("\[(\d+\.?\d*)\]")
 
 
-def judge_llm_factory(model: str) -> Callable[[Log], float]:
+def judge_llm_factory(model: str, question_field: str = "question") -> Callable[[Log], float]:
     """Measures the generated response quality by using a LLM on a scale of 1 to 10."""
 
     def _eval_judge_llm(log: Log) -> float:
-        question = log.inputs["question"]
+        question = log.inputs[question_field]
         output = log.output
         rating_response = call_openai(
             model=model,
@@ -50,11 +50,14 @@ def _eval_judge_llm(log: Log) -> float:
 
 
 judge_llm_gpt4 = judge_llm_factory("gpt-4")
-judge_llm_gpt3t = judge_llm_factory("gpt-3.5-turbo")
+judge_llm_gpt3t = judge_llm_factory("gpt-3.5-turbo-16k")
 
 
 def self_check_gpt(log: Log) -> float:
     """Measures how consistent is the output of a model under resampling the response."""
+    if log.configuration is None or log.configuration.messages is None:
+        return 0.0
+
     messages = [m.to_dict() for m in log.configuration.messages]
 
     n_sampled_outputs = 5
@@ -73,6 +76,9 @@ def self_check_gpt(log: Log) -> float:
 
     sentences = sent_tokenize(log.output)
+    if len(sentences) == 0:
+        return 0.0
+
     sentences_scores = []
     for sentence in sentences:
         scores = []
@@ -105,7 +111,7 @@ def lm_vs_lm_factuality(log: Log) -> float:
         messages_examinee = [m.to_dict() for m in log.configuration.messages]
 
         # ask examiner for follow-up questions
-        setup_prompt = f"""Your goal is to try to verify the correctness of the following claim: {output}, based on the background information you will gather. To gather this, You will provide short questions whose purpose will be to verify the correctness of the claim, and I will reply to you with the answers to these. Hopefully, with the help of the background questions and their answers, you will be able to reach a conclusion as to whether the claim is correct or possibly incorrect. Please keep asking questions as long as you’re yet to be sure regarding the true veracity of the claim. Please start with the first questions."""
+        setup_prompt = f"""Your goal is to try to verify the correctness of the following claim: "{output}", based on the background information you will gather. To gather this, You will provide short questions whose purpose will be to verify the correctness of the claim, and I will reply to you with the answers to these. Hopefully, with the help of the background questions and their answers, you will be able to reach a conclusion as to whether the claim is correct or possibly incorrect. Please keep asking questions as long as you’re yet to be sure regarding the true veracity of the claim. Please start with the first questions."""
         messages_examiner = [{"role": "user", "content": setup_prompt}]
         follow_up_questions = call_openai(
             model=examiner_model,
@@ -113,6 +119,7 @@
             temperature=0.0,
         )
         messages_examiner += [{"role": "assistant", "content": follow_up_questions}]
+        n_rounds_follow_up_questions = 1
 
         follow_up_prompt = """(i) Do you have any follow-up questions? Please answer with Yes or No.
 (ii) What are the follow-up questions?"""
@@ -128,10 +135,13 @@ def lm_vs_lm_factuality(log: Log) -> float:
                 presence_penalty=log.configuration.model_params.presence_penalty,
                 max_tokens=log.configuration.model_params.max_length,
             )
-            messages_examiner += [
-                {"role": "assistant", "content": follow_up_answers},
-                {"role": "user", "content": follow_up_prompt},
-            ]
+            messages_examiner.append({"role": "assistant", "content": follow_up_answers})
+
+            if n_rounds_follow_up_questions > 3:
+                break
+            else:
+                messages_examiner.append({"role": "user", "content": follow_up_prompt})
+                n_rounds_follow_up_questions += 1
 
             examiner_response = call_openai(
                 model=examiner_model,
@@ -158,3 +168,8 @@ def lm_vs_lm_factuality(log: Log) -> float:
         return float("incorrect" not in examiner_response.lower())
 
     return lm_vs_lm_factuality
+
+
+lm_vs_lm_factuality_gpt4 = lm_vs_lm_factuality_factory("gpt-4")
+
+lm_vs_lm_factuality_gpt3t = lm_vs_lm_factuality_factory("gpt-3.5-turbo-16k")
diff --git a/parea/evals/rag.py b/parea/evals/rag.py
index 779da883..9fd1983b 100644
--- a/parea/evals/rag.py
+++ b/parea/evals/rag.py
@@ -72,7 +72,7 @@ def llm_critique_faithfulness(log: Log) -> float:
 
 def recall_response(log: Log) -> float:
     """Prop. of tokens in target/reference answer which are also in model generation."""
-    target = log.inputs["target"]
+    target = log.target
     output = log.output
     provider = log.configuration.provider
 
@@ -240,7 +240,7 @@ def ragas_answer_context_faithfulness(log: Log) -> float:
     return ragas_answer_context_faithfulness
 
 
-def ragas_answer_relevancy_factor(question_field: str = "question", n_generations: int = 3) -> Callable[[Log], float]:
+def ragas_answer_relevancy_factory(question_field: str = "question", n_generations: int = 3) -> Callable[[Log], float]:
     """Quantifies how much the generated answer relates to the query."""
     try:
         import numpy as np
@@ -290,8 +290,8 @@ def ragas_context_ranking_factory(question_field: str = "question", context_fiel
 
     def ragas_context_ranking(log: Log) -> float:
         """Quantifies if the retrieved context is ranked by their relevancy"""
-        contexts = [log.inputs[context_field] for context_field in context_fields]
         question = log.inputs[question_field]
+        contexts = [log.inputs[context_field] for context_field in context_fields]
 
         verifications = []
         for context in contexts:
diff --git a/parea/evals/utils.py b/parea/evals/utils.py
index 02a3b968..af9401b3 100644
--- a/parea/evals/utils.py
+++ b/parea/evals/utils.py
@@ -61,6 +61,6 @@ def call_openai(messages, model, temperature=1.0, max_tokens=None, top_p=1.0, fr
 
 def embed(model, input) -> List[float]:
     if openai_version.startswith("0."):
-        return openai.Embedding.create(model=model, input=input, encoding_format="float")["embedding"]
+        return openai.Embedding.create(model=model, input=input, encoding_format="float").data[0]["embedding"]
     else:
-        return openai.embeddings.create(model=model, input=input, encoding_format="float").embedding
+        return openai.embeddings.create(model=model, input=input, encoding_format="float").data[0].embedding
diff --git a/pyproject.toml b/pyproject.toml
index 2b49d715..fc09985a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry]
 name = "parea-ai"
 packages = [{ include = "parea" }]
-version = "0.2.17"
+version = "0.2.18"
 description = "Parea python sdk"
 readme = "README.md"
 authors = ["joel-parea-ai "]
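
Usage sketch (not part of the patch): a minimal example of wiring up the factory-based evals touched above. It assumes only the names visible in this diff (judge_llm_factory with its new question_field parameter, lm_vs_lm_factuality_factory, and evals typed as Callable[[Log], float]); the input key "query" and the way evals are handed to the rest of the SDK are illustrative and may differ by version.

    from parea.evals.general import judge_llm_factory, lm_vs_lm_factuality_factory

    # judge_llm_factory now takes question_field, so the judged question can be
    # read from a custom input key instead of the default "question".
    # "query" is a hypothetical key used here only for illustration.
    judge_query_quality = judge_llm_factory("gpt-4", question_field="query")

    # Build the LM-vs-LM factuality eval with an explicit examiner model;
    # the new module-level lm_vs_lm_factuality_gpt4 / _gpt3t shortcuts added in
    # this diff are equivalent prebuilt instances.
    factuality_eval = lm_vs_lm_factuality_factory("gpt-4")

    # Both objects are plain Callable[[Log], float] and return a score per Log,
    # so they can be passed wherever the SDK accepts eval functions.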