Commit
Merge pull request #710 from parea-ai/PAI-979-evals-support-azure
support azure in evals
jalexanderII committed Apr 8, 2024
2 parents 3e65430 + b01b69a commit 98d114b
Showing 19 changed files with 285 additions and 52 deletions.
157 changes: 157 additions & 0 deletions parea/cookbook/langchain/trace_langchain_azure_RAG_with_experiment.py
@@ -0,0 +1,157 @@
import os
from datetime import datetime
from operator import itemgetter

from dotenv import load_dotenv
from langchain.text_splitter import TokenTextSplitter
from langchain_community.document_loaders import RecursiveUrlLoader
from langchain_community.document_transformers import Html2TextTransformer
from langchain_community.vectorstores.pinecone import Pinecone
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from pinecone import Pinecone as PineconeClient

from parea import Parea, trace, trace_insert
from parea.evals.general import answer_matches_target_llm_grader_factory, answer_relevancy_factory
from parea.evals.rag import (
answer_context_faithfulness_binary_factory,
answer_context_faithfulness_statement_level_factory,
context_query_relevancy_factory,
percent_target_supported_by_context_factory,
)
from parea.utils.trace_integrations.langchain import PareaAILangchainTracer

load_dotenv()

p = Parea(api_key=os.getenv("PAREA_API_KEY"))
handler = PareaAILangchainTracer()

pinecone = PineconeClient(api_key=os.getenv("PINECONE_API_KEY"), environment=os.getenv("PINECONE_ENVIRONMENT"))

AZURE_MODEL = "gpt-model-0613"
AZURE_EMBEDDINGS = "text-embedding-3-small"


class DocumentRetriever:
def __init__(self, url: str):
api_loader = RecursiveUrlLoader(url)
raw_documents = api_loader.load()

# Transformer
doc_transformer = Html2TextTransformer()
transformed = doc_transformer.transform_documents(raw_documents)

# Splitter
text_splitter = TokenTextSplitter(
chunk_size=2000,
chunk_overlap=200,
)
documents = text_splitter.split_documents(transformed)

        # Build the vector store from the split documents using Azure OpenAI embeddings
embeddings = AzureOpenAIEmbeddings(azure_deployment=AZURE_EMBEDDINGS)
vectorstore = Pinecone.from_documents(documents, embeddings, index_name=os.getenv("PINECONE_INDEX_NAME"))
self.retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

def get_retriever(self):
return self.retriever


class DocumentationChain:
def __init__(self, url):
retriever = DocumentRetriever(url).get_retriever()
model = AzureChatOpenAI(azure_deployment=AZURE_MODEL)
prompt = ChatPromptTemplate.from_messages(
[
(
"system",
"You are a helpful documentation Q&A assistant, trained to answer questions from the provided context."
"\nThe current time is {time}.\n\nRelevant documents will be retrieved in the following messages.",
),
("system", "{context}"),
("human", "{question}"),
]
).partial(time=str(datetime.now()))

response_generator = prompt | model | StrOutputParser()

self.chain = {
"context": itemgetter("question") | retriever | self._format_docs,
"question": itemgetter("question"),
} | response_generator

def get_context(self) -> str:
"""Helper to get the context from a retrieval chain, so we can use it for evaluation metrics."""
return self.context

def _format_docs(self, docs) -> str:
context = "\n\n".join(doc.page_content for doc in docs)
# set context as an attribute, so we can access it later
self.context = context
return context[:2000]

def get_chain(self):
return self.chain


# EXAMPLE EVALUATION TEST CASES
eval_questions = [
"What is the population of New York City as of 2020?",
"Which borough of New York City has the highest population? Only respond with the name of the borough.",
"What is the economic significance of New York City?",
"How did New York City get its name?",
"What is the significance of the Statue of Liberty in New York City?",
]

eval_answers = [
"8,804,190",
"Brooklyn",
"""New York City's economic significance is vast, as it serves as the global financial capital, housing Wall
Street and major financial institutions. Its diverse economy spans technology, media, healthcare, education,
and more, making it resilient to economic fluctuations. NYC is a hub for international business, attracting
global companies, and boasts a large, skilled labor force. Its real estate market, tourism, cultural industries,
and educational institutions further fuel its economic prowess. The city's transportation network and global
influence amplify its impact on the world stage, solidifying its status as a vital economic player and cultural
epicenter.""",
"""New York City got its name when it came under British control in 1664. King Charles II of England granted the
lands to his brother, the Duke of York, who named the city New York in his own honor.""",
"""The Statue of Liberty in New York City holds great significance as a symbol of the United States and its
ideals of liberty and peace. It greeted millions of immigrants who arrived in the U.S. by ship in the late 19th
and early 20th centuries, representing hope and freedom for those seeking a better life. It has since become an
iconic landmark and a global symbol of cultural diversity and freedom.""",
]
# create a dataset of questions and targets
dataset = [{"question": q, "target": t} for q, t in zip(eval_questions, eval_answers)]


@trace(
eval_funcs=[
# these are factory functions that return the actual evaluation functions, so we need to call them
answer_matches_target_llm_grader_factory(model=AZURE_MODEL, is_azure=True),
answer_context_faithfulness_binary_factory(model=AZURE_MODEL, is_azure=True),
answer_relevancy_factory(model=AZURE_MODEL, embedding_model=AZURE_EMBEDDINGS, is_azure=True),
answer_context_faithfulness_statement_level_factory(model=AZURE_MODEL, is_azure=True),
context_query_relevancy_factory(context_fields=["context"], model=AZURE_MODEL, is_azure=True),
percent_target_supported_by_context_factory(context_fields=["context"], model=AZURE_MODEL, is_azure=True),
]
)
def main(question: str) -> str:
dc = DocumentationChain(url="https://en.wikipedia.org/wiki/New_York_City")
output = dc.get_chain().invoke(
{"question": question},
config={"callbacks": [handler]}, # pass the Parea callback handler to the chain
)
# insert the context into the trace as an input so that it can be referenced in the evaluation functions
# context needs to be retrieved after the chain is invoked
trace_insert({"inputs": {"context": dc.get_context()}})
print(output)
return output


if __name__ == "__main__":
p.experiment(
name="NYC_Wiki_RAG",
data=dataset,
func=main,
).run()
9 changes: 7 additions & 2 deletions parea/evals/chat/goal_success_ratio.py
@@ -6,7 +6,9 @@
from parea.schemas.log import Log


def goal_success_ratio_factory(use_output: Optional[bool] = False, message_field: Optional[str] = None) -> Callable[[Log], float]:
def goal_success_ratio_factory(
use_output: Optional[bool] = False, message_field: Optional[str] = None, model: Optional[str] = "gpt-4", is_azure: Optional[bool] = False
) -> Callable[[Log], float]:
"""
This factory creates an evaluation function that measures the success ratio of a goal-oriented conversation.
Typically, a user interacts with a chatbot or AI assistant to achieve specific goals.
@@ -19,6 +21,8 @@ def goal_success_ratio_factory(use_output: Optional[bool] = False, message_field
3. Calculate the average number of messages sent per segment.
Args:
        is_azure: Whether to use the Azure API. Defaults to False.
model: The model which should be used for grading.
use_output (Optional[bool], optional): Whether to use the output of the log to access the messages. Defaults to False.
message_field (Optional[str], optional): The name of the field in the log that contains the messages.
Defaults to None. If None, the messages are taken from the configuration attribute.
@@ -56,7 +60,8 @@ def goal_success_ratio(log: Log) -> float:
}
]
+ messages[start_index:end_index],
model="gpt-4",
model=model,
is_azure=is_azure,
)

if user_follows_same_goal == "SAME_GOAL":
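A minimal usage sketch of the updated factory (not part of the diff): it reuses the Azure deployment name from the cookbook above, and the traced chat function is a hypothetical placeholder; module paths follow the file paths shown in this commit.

import os

from parea import Parea, trace
from parea.evals.chat.goal_success_ratio import goal_success_ratio_factory

p = Parea(api_key=os.getenv("PAREA_API_KEY"))

# Grade goal success with an Azure deployment instead of the default "gpt-4".
goal_success = goal_success_ratio_factory(use_output=True, model="gpt-model-0613", is_azure=True)


@trace(eval_funcs=[goal_success])
def assistant_turn(messages: str) -> str:
    # placeholder: return the assistant reply; the eval segments and grades the traced conversation
    ...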
2 changes: 2 additions & 0 deletions parea/evals/general/answer_matches_target_llm_grader.py
@@ -7,6 +7,7 @@
def answer_matches_target_llm_grader_factory(
question_field: Optional[str] = "question",
model: Optional[str] = "gpt-4",
is_azure: Optional[bool] = False,
) -> Callable[[Log], Union[float, None]]:
"""Quantifies how much the generated answer matches the ground truth / target."""

@@ -31,6 +32,7 @@ def answer_matches_target_llm_grader(log: Log) -> Union[float, None]:
},
],
temperature=0.0,
is_azure=is_azure,
)
return float("yes" in response.lower())

20 changes: 15 additions & 5 deletions parea/evals/general/answer_relevancy.py
@@ -1,17 +1,26 @@
from typing import Callable
from typing import Callable, Optional

from parea.evals.utils import call_openai, embed
from parea.schemas.log import Log


def answer_relevancy_factory(question_field: str = "question", n_generations: int = 3) -> Callable[[Log], float]:
def answer_relevancy_factory(
question_field: str = "question",
n_generations: int = 3,
model: Optional[str] = "gpt-3.5-turbo-16k",
embedding_model: str = "text-embedding-ada-002",
is_azure: Optional[bool] = False,
) -> Callable[[Log], float]:
"""
This factory creates an evaluation function that measures how relevant the generated response is to the given question.
It is based on the paper [RAGAS: Automated Evaluation of Retrieval Augmented Generation](https://arxiv.org/abs/2309.15217)
which suggests using an LLM to generate multiple questions that fit the generated answer and measure the cosine
similarity of the generated questions with the original one.
Args:
is_azure: Whether to use the Azure API. Defaults to False.
embedding_model: The model which should be used for embedding the text.
model: The model which should be used for grading. Defaults to "gpt-3.5-turbo-16k".
question_field: The key name/field used for the question/query of the user. Defaults to "question".
n_generations: The number of questions which should be generated. Defaults to 3.
@@ -33,7 +42,7 @@ def answer_relevancy(log: Log) -> float:
output = log.output

generated_questions = call_openai(
model="gpt-3.5-turbo-16k",
model=model,
messages=[
{
"role": "user",
@@ -48,9 +57,10 @@
],
temperature=0.0,
n=n_generations,
is_azure=is_azure,
)
embedded_generated_questions = [embed(model="text-embedding-ada-002", input=q) for q in generated_questions]
embedded_question = embed(model="text-embedding-ada-002", input=question)
embedded_generated_questions = [embed(model=embedding_model, input=q, is_azure=is_azure) for q in generated_questions]
embedded_question = embed(model=embedding_model, input=question, is_azure=is_azure)

question_vec = np.asarray(embedded_question).reshape(1, -1)
gen_question_vec = np.asarray(embedded_generated_questions)
8 changes: 5 additions & 3 deletions parea/evals/general/llm_grader.py
@@ -1,4 +1,4 @@
from typing import Callable
from typing import Callable, Optional

import ast
import re
@@ -10,17 +10,18 @@
one_score_pattern_backup = re.compile(r"\[(\d+\.?\d*)\]")


def llm_grader_factory(model: str, question_field: str = "question") -> Callable[[Log], float]:
def llm_grader_factory(model: str = "gpt-4", question_field: str = "question", is_azure: Optional[bool] = False) -> Callable[[Log], float]:
"""
This factory creates an evaluation function that uses an LLM to grade the response of an LLM to a given question.
It is based on the paper [Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena](https://arxiv.org/abs/2306.05685)
which intorduces general-purpose zero-shot prompt to rate responses from an LLM to a given question on a scale from 1-10.
which introduces general-purpose zero-shot prompt to rate responses from an LLM to a given question on a scale from 1-10.
They find that GPT-4's ratings agree as much with a human rater as a human annotator agrees with another one (>80%).
Further, they observe that the agreement with a human annotator increases as the response rating gets clearer.
Additionally, they investigated how much the evaluating LLM overestimated its responses and found that GPT-4 and
Claude-1 were the only models that didn't overestimate themselves.
Args:
is_azure: Whether to use the Azure API. Defaults to False.
model: The model which should be used for grading. Currently, only supports OpenAI chat models.
question_field: The key name/field used for the question/query of the user. Defaults to "question".
@@ -49,6 +50,7 @@ def llm_grader(log: Log) -> float:
},
],
temperature=0.0,
is_azure=is_azure,
)
match = re.search(one_score_pattern, rating_response)
if not match:
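The same pattern applies here; a sketch of grading a traced function with an Azure chat deployment as the judge (the answering function body is a placeholder, not part of this change):

from parea import trace
from parea.evals.general.llm_grader import llm_grader_factory

# Rate responses on a 1-10 scale using an Azure deployment for the grading call.
llm_judge_azure = llm_grader_factory(model="gpt-model-0613", is_azure=True)


@trace(eval_funcs=[llm_judge_azure])
def answer_question(question: str) -> str:
    # placeholder: call your own model here; the grader scores the traced output
    ...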
14 changes: 11 additions & 3 deletions parea/evals/general/lm_vs_lm.py
@@ -1,10 +1,13 @@
from typing import Callable
from typing import Callable, Optional

from parea.evals.utils import call_openai
from parea.schemas.log import Log


def lm_vs_lm_factuality_factory(examiner_model: str = "gpt-3.5-turbo") -> Callable[[Log], float]:
def lm_vs_lm_factuality_factory(
examiner_model: str = "gpt-4",
is_azure: Optional[bool] = False,
) -> Callable[[Log], float]:
"""
This factory creates an evaluation function that measures the factuality of an LLM's response to a given question.
It is based on the paper [LM vs LM: Detecting Factual Errors via Cross Examination](https://arxiv.org/abs/2305.13281) which proposes using
@@ -14,6 +17,7 @@ def lm_vs_lm_factuality_factory(examiner_model: str = "gpt-3.5-turbo") -> Callab
model to say, "I don't know," if it is uncertain.
Args:
is_azure: Whether to use the Azure API. Defaults to False.
examiner_model: The model which will examine the original model. Currently, only supports OpenAI chat models.
Returns:
@@ -32,6 +36,7 @@ def lm_vs_lm_factuality(log: Log) -> float:
model=examiner_model,
messages=messages_examiner,
temperature=0.0,
is_azure=is_azure,
)
messages_examiner += [{"role": "assistant", "content": follow_up_questions}]
n_rounds_follow_up_questions = 1
@@ -42,14 +47,15 @@
while follow_up_questions is not None:
messages_examinee += [{"role": "user", "content": follow_up_questions}]
follow_up_answers = call_openai(
model=log.configuration.model,
model=examiner_model if is_azure else log.configuration.model,
messages=messages_examinee,
temperature=log.configuration.model_params.temp,
top_p=log.configuration.model_params.top_p,
frequency_penalty=log.configuration.model_params.frequency_penalty,
presence_penalty=log.configuration.model_params.presence_penalty,
max_tokens=log.configuration.model_params.max_length,
response_format=log.configuration.model_params.response_format,
is_azure=is_azure,
)
messages_examiner.append({"role": "assistant", "content": follow_up_answers})

@@ -63,6 +69,7 @@ def lm_vs_lm_factuality(log: Log) -> float:
model=examiner_model,
messages=messages_examiner,
temperature=0.0,
is_azure=is_azure,
)
messages_examiner += [{"role": "assistant", "content": examiner_response}]
if "yes" in examiner_response.lower():
@@ -80,6 +87,7 @@ def lm_vs_lm_factuality(log: Log) -> float:
model=examiner_model,
messages=messages_examiner,
temperature=0.0,
is_azure=is_azure,
)
return float("incorrect" not in examiner_response.lower())

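Worth noting in this diff: when is_azure=True, the examinee follow-up calls also use examiner_model rather than log.configuration.model, so a single Azure deployment plays both roles. A hedged usage sketch (placeholder answering function, deployment name borrowed from the cookbook):

from parea import trace
from parea.evals.general.lm_vs_lm import lm_vs_lm_factuality_factory

# One Azure deployment acts as both examiner and examinee when is_azure=True.
factuality_azure = lm_vs_lm_factuality_factory(examiner_model="gpt-model-0613", is_azure=True)


@trace(eval_funcs=[factuality_azure])
def answer_question(question: str) -> str:
    # placeholder: produce the answer that will be cross-examined for factuality
    ...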
14 changes: 8 additions & 6 deletions parea/evals/general/semantic_similarity.py
@@ -1,21 +1,23 @@
from typing import Callable, Union
from typing import Callable, Optional, Union

import numpy as np

from parea.evals.utils import embed
from parea.schemas import Log


def semantic_similarity_factory(embd_model: str = "text-embedding-3-small") -> Callable[[Log], Union[float, None]]:
def semantic_similarity_factory(
embd_model: str = "text-embedding-3-small",
is_azure: Optional[bool] = False,
) -> Callable[[Log], Union[float, None]]:
def semantic_similarity(log: Log) -> Union[float, None]:
"""Calculates semantic similarity between output and target"""
output = log.output
if (target := log.target) is None:
return None

output_vector = embed(model=embd_model, input=output)
target_vector = embed(model=embd_model, input=target)

output_vector = embed(model=embd_model, input=output, is_azure=is_azure)
target_vector = embed(model=embd_model, input=target, is_azure=is_azure)
output_vector = np.array(output_vector)
target_vector = np.array(target_vector)

@@ -24,6 +26,6 @@ def semantic_similarity(log: Log) -> Union[float, None]:
return semantic_similarity


semantic_similarity_oai_3_small = semantic_similarity_factory(embd_model="text-embedding-3-small")
semantic_similarity_oai_3_small = semantic_similarity_factory()
semantic_similarity_oai_3_large = semantic_similarity_factory(embd_model="text-embedding-3-large")
semantic_similarity_oai_ada_002 = semantic_similarity_factory(embd_model="text-embedding-ada-002")
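A sketch of the new Azure path for this metric (not part of the diff); here the embd_model value is assumed to name an Azure embedding deployment, e.g. the text-embedding-3-small deployment from the cookbook, rather than an OpenAI model id:

from parea import trace
from parea.evals.general.semantic_similarity import semantic_similarity_factory

# Embed output and target via the Azure deployment and compare by cosine similarity.
semantic_similarity_azure = semantic_similarity_factory(embd_model="text-embedding-3-small", is_azure=True)


@trace(eval_funcs=[semantic_similarity_azure])
def answer_question(question: str) -> str:
    # placeholder: the traced output is compared against the dataset target
    ...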
(Diffs for the remaining 12 changed files are not shown.)
