Commit
Merge pull request #710 from parea-ai/PAI-979-evals-support-azure
support azure in evals
jalexanderII committed Apr 8, 2024
2 parents 3e65430 + b01b69a commit 98d114b
Showing 19 changed files with 285 additions and 52 deletions.
157 changes: 157 additions & 0 deletions parea/cookbook/langchain/trace_langchain_azure_RAG_with_experiment.py
@@ -0,0 +1,157 @@
import os
from datetime import datetime
from operator import itemgetter

from dotenv import load_dotenv
from langchain.text_splitter import TokenTextSplitter
from langchain_community.document_loaders import RecursiveUrlLoader
from langchain_community.document_transformers import Html2TextTransformer
from langchain_community.vectorstores.pinecone import Pinecone
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from pinecone import Pinecone as PineconeClient

from parea import Parea, trace, trace_insert
from parea.evals.general import answer_matches_target_llm_grader_factory, answer_relevancy_factory
from parea.evals.rag import (
answer_context_faithfulness_binary_factory,
answer_context_faithfulness_statement_level_factory,
context_query_relevancy_factory,
percent_target_supported_by_context_factory,
)
from parea.utils.trace_integrations.langchain import PareaAILangchainTracer

load_dotenv()

p = Parea(api_key=os.getenv("PAREA_API_KEY"))
handler = PareaAILangchainTracer()

pinecone = PineconeClient(api_key=os.getenv("PINECONE_API_KEY"), environment=os.getenv("PINECONE_ENVIRONMENT"))

AZURE_MODEL = "gpt-model-0613"
AZURE_EMBEDDINGS = "text-embedding-3-small"


class DocumentRetriever:
def __init__(self, url: str):
api_loader = RecursiveUrlLoader(url)
raw_documents = api_loader.load()

# Transformer
doc_transformer = Html2TextTransformer()
transformed = doc_transformer.transform_documents(raw_documents)

# Splitter
text_splitter = TokenTextSplitter(
chunk_size=2000,
chunk_overlap=200,
)
documents = text_splitter.split_documents(transformed)

        # Build the vector store from the split documents using Azure OpenAI embeddings
embeddings = AzureOpenAIEmbeddings(azure_deployment=AZURE_EMBEDDINGS)
vectorstore = Pinecone.from_documents(documents, embeddings, index_name=os.getenv("PINECONE_INDEX_NAME"))
self.retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

def get_retriever(self):
return self.retriever


class DocumentationChain:
def __init__(self, url):
retriever = DocumentRetriever(url).get_retriever()
model = AzureChatOpenAI(azure_deployment=AZURE_MODEL)
prompt = ChatPromptTemplate.from_messages(
[
(
"system",
"You are a helpful documentation Q&A assistant, trained to answer questions from the provided context."
"\nThe current time is {time}.\n\nRelevant documents will be retrieved in the following messages.",
),
("system", "{context}"),
("human", "{question}"),
]
).partial(time=str(datetime.now()))

response_generator = prompt | model | StrOutputParser()

self.chain = {
"context": itemgetter("question") | retriever | self._format_docs,
"question": itemgetter("question"),
} | response_generator

def get_context(self) -> str:
"""Helper to get the context from a retrieval chain, so we can use it for evaluation metrics."""
return self.context

def _format_docs(self, docs) -> str:
context = "\n\n".join(doc.page_content for doc in docs)
# set context as an attribute, so we can access it later
self.context = context
return context[:2000]

def get_chain(self):
return self.chain


# EXAMPLE EVALUATION TEST CASES
eval_questions = [
"What is the population of New York City as of 2020?",
"Which borough of New York City has the highest population? Only respond with the name of the borough.",
"What is the economic significance of New York City?",
"How did New York City get its name?",
"What is the significance of the Statue of Liberty in New York City?",
]

eval_answers = [
"8,804,190",
"Brooklyn",
"""New York City's economic significance is vast, as it serves as the global financial capital, housing Wall
Street and major financial institutions. Its diverse economy spans technology, media, healthcare, education,
and more, making it resilient to economic fluctuations. NYC is a hub for international business, attracting
global companies, and boasts a large, skilled labor force. Its real estate market, tourism, cultural industries,
and educational institutions further fuel its economic prowess. The city's transportation network and global
influence amplify its impact on the world stage, solidifying its status as a vital economic player and cultural
epicenter.""",
"""New York City got its name when it came under British control in 1664. King Charles II of England granted the
lands to his brother, the Duke of York, who named the city New York in his own honor.""",
"""The Statue of Liberty in New York City holds great significance as a symbol of the United States and its
ideals of liberty and peace. It greeted millions of immigrants who arrived in the U.S. by ship in the late 19th
and early 20th centuries, representing hope and freedom for those seeking a better life. It has since become an
iconic landmark and a global symbol of cultural diversity and freedom.""",
]
# create a dataset of questions and targets
dataset = [{"question": q, "target": t} for q, t in zip(eval_questions, eval_answers)]


@trace(
eval_funcs=[
# these are factory functions that return the actual evaluation functions, so we need to call them
answer_matches_target_llm_grader_factory(model=AZURE_MODEL, is_azure=True),
answer_context_faithfulness_binary_factory(model=AZURE_MODEL, is_azure=True),
answer_relevancy_factory(model=AZURE_MODEL, embedding_model=AZURE_EMBEDDINGS, is_azure=True),
answer_context_faithfulness_statement_level_factory(model=AZURE_MODEL, is_azure=True),
context_query_relevancy_factory(context_fields=["context"], model=AZURE_MODEL, is_azure=True),
percent_target_supported_by_context_factory(context_fields=["context"], model=AZURE_MODEL, is_azure=True),
]
)
def main(question: str) -> str:
dc = DocumentationChain(url="https://en.wikipedia.org/wiki/New_York_City")
output = dc.get_chain().invoke(
{"question": question},
config={"callbacks": [handler]}, # pass the Parea callback handler to the chain
)
# insert the context into the trace as an input so that it can be referenced in the evaluation functions
# context needs to be retrieved after the chain is invoked
trace_insert({"inputs": {"context": dc.get_context()}})
print(output)
return output


if __name__ == "__main__":
p.experiment(
name="NYC_Wiki_RAG",
data=dataset,
func=main,
).run()
9 changes: 7 additions & 2 deletions parea/evals/chat/goal_success_ratio.py
@@ -6,7 +6,9 @@
from parea.schemas.log import Log


def goal_success_ratio_factory(use_output: Optional[bool] = False, message_field: Optional[str] = None) -> Callable[[Log], float]:
def goal_success_ratio_factory(
use_output: Optional[bool] = False, message_field: Optional[str] = None, model: Optional[str] = "gpt-4", is_azure: Optional[bool] = False
) -> Callable[[Log], float]:
"""
This factory creates an evaluation function that measures the success ratio of a goal-oriented conversation.
Typically, a user interacts with a chatbot or AI assistant to achieve specific goals.
@@ -19,6 +21,8 @@ def goal_success_ratio_factory(use_output: Optional[bool] = False, message_field
3. Calculate the average number of messages sent per segment.
Args:
        is_azure: Whether to use the Azure API. Defaults to False.
model: The model which should be used for grading.
use_output (Optional[bool], optional): Whether to use the output of the log to access the messages. Defaults to False.
message_field (Optional[str], optional): The name of the field in the log that contains the messages.
Defaults to None. If None, the messages are taken from the configuration attribute.
@@ -56,7 +60,8 @@ def goal_success_ratio(log: Log) -> float:
}
]
+ messages[start_index:end_index],
model="gpt-4",
model=model,
is_azure=is_azure,
)

if user_follows_same_goal == "SAME_GOAL":
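A minimal usage sketch of the updated factory (not part of the diff): it reuses the Azure deployment name from the cookbook above, and the traced chat function is a hypothetical placeholder; module paths follow the file paths shown in this commit.

import os

from parea import Parea, trace
from parea.evals.chat.goal_success_ratio import goal_success_ratio_factory

p = Parea(api_key=os.getenv("PAREA_API_KEY"))

# Grade goal success with an Azure deployment instead of the default "gpt-4".
goal_success = goal_success_ratio_factory(use_output=True, model="gpt-model-0613", is_azure=True)


@trace(eval_funcs=[goal_success])
def assistant_turn(messages: str) -> str:
    # placeholder: return the assistant reply; the eval segments and grades the traced conversation
    ...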
2 changes: 2 additions & 0 deletions parea/evals/general/answer_matches_target_llm_grader.py
@@ -7,6 +7,7 @@
def answer_matches_target_llm_grader_factory(
question_field: Optional[str] = "question",
model: Optional[str] = "gpt-4",
is_azure: Optional[bool] = False,
) -> Callable[[Log], Union[float, None]]:
"""Quantifies how much the generated answer matches the ground truth / target."""

@@ -31,6 +32,7 @@ def answer_matches_target_llm_grader(log: Log) -> Union[float, None]:
},
],
temperature=0.0,
is_azure=is_azure,
)
return float("yes" in response.lower())

20 changes: 15 additions & 5 deletions parea/evals/general/answer_relevancy.py
@@ -1,17 +1,26 @@
from typing import Callable
from typing import Callable, Optional

from parea.evals.utils import call_openai, embed
from parea.schemas.log import Log


def answer_relevancy_factory(question_field: str = "question", n_generations: int = 3) -> Callable[[Log], float]:
def answer_relevancy_factory(
question_field: str = "question",
n_generations: int = 3,
model: Optional[str] = "gpt-3.5-turbo-16k",
embedding_model: str = "text-embedding-ada-002",
is_azure: Optional[bool] = False,
) -> Callable[[Log], float]:
"""
This factory creates an evaluation function that measures how relevant the generated response is to the given question.
It is based on the paper [RAGAS: Automated Evaluation of Retrieval Augmented Generation](https://arxiv.org/abs/2309.15217)
which suggests using an LLM to generate multiple questions that fit the generated answer and measure the cosine
similarity of the generated questions with the original one.
Args:
is_azure: Whether to use the Azure API. Defaults to False.
embedding_model: The model which should be used for embedding the text.
model: The model which should be used for grading. Defaults to "gpt-3.5-turbo-16k".
question_field: The key name/field used for the question/query of the user. Defaults to "question".
n_generations: The number of questions which should be generated. Defaults to 3.
@@ -33,7 +42,7 @@ def answer_relevancy(log: Log) -> float:
output = log.output

generated_questions = call_openai(
model="gpt-3.5-turbo-16k",
model=model,
messages=[
{
"role": "user",
@@ -48,9 +57,10 @@
],
temperature=0.0,
n=n_generations,
is_azure=is_azure,
)
embedded_generated_questions = [embed(model="text-embedding-ada-002", input=q) for q in generated_questions]
embedded_question = embed(model="text-embedding-ada-002", input=question)
embedded_generated_questions = [embed(model=embedding_model, input=q, is_azure=is_azure) for q in generated_questions]
embedded_question = embed(model=embedding_model, input=question, is_azure=is_azure)

question_vec = np.asarray(embedded_question).reshape(1, -1)
gen_question_vec = np.asarray(embedded_generated_questions)
8 changes: 5 additions & 3 deletions parea/evals/general/llm_grader.py
@@ -1,4 +1,4 @@
from typing import Callable
from typing import Callable, Optional

import ast
import re
@@ -10,17 +10,18 @@
one_score_pattern_backup = re.compile(r"\[(\d+\.?\d*)\]")


def llm_grader_factory(model: str, question_field: str = "question") -> Callable[[Log], float]:
def llm_grader_factory(model: str = "gpt-4", question_field: str = "question", is_azure: Optional[bool] = False) -> Callable[[Log], float]:
"""
This factory creates an evaluation function that uses an LLM to grade the response of an LLM to a given question.
It is based on the paper [Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena](https://arxiv.org/abs/2306.05685)
which intorduces general-purpose zero-shot prompt to rate responses from an LLM to a given question on a scale from 1-10.
which introduces general-purpose zero-shot prompt to rate responses from an LLM to a given question on a scale from 1-10.
They find that GPT-4's ratings agree as much with a human rater as a human annotator agrees with another one (>80%).
Further, they observe that the agreement with a human annotator increases as the response rating gets clearer.
Additionally, they investigated how much the evaluating LLM overestimated its responses and found that GPT-4 and
Claude-1 were the only models that didn't overestimate themselves.
Args:
is_azure: Whether to use the Azure API. Defaults to False.
model: The model which should be used for grading. Currently, only supports OpenAI chat models.
question_field: The key name/field used for the question/query of the user. Defaults to "question".
@@ -49,6 +50,7 @@ def llm_grader(log: Log) -> float:
},
],
temperature=0.0,
is_azure=is_azure,
)
match = re.search(one_score_pattern, rating_response)
if not match:
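The same pattern applies here; a sketch of grading a traced function with an Azure chat deployment as the judge (the answering function body is a placeholder, not part of this change):

from parea import trace
from parea.evals.general.llm_grader import llm_grader_factory

# Rate responses on a 1-10 scale using an Azure deployment for the grading call.
llm_judge_azure = llm_grader_factory(model="gpt-model-0613", is_azure=True)


@trace(eval_funcs=[llm_judge_azure])
def answer_question(question: str) -> str:
    # placeholder: call your own model here; the grader scores the traced output
    ...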
14 changes: 11 additions & 3 deletions parea/evals/general/lm_vs_lm.py
@@ -1,10 +1,13 @@
from typing import Callable
from typing import Callable, Optional

from parea.evals.utils import call_openai
from parea.schemas.log import Log


def lm_vs_lm_factuality_factory(examiner_model: str = "gpt-3.5-turbo") -> Callable[[Log], float]:
def lm_vs_lm_factuality_factory(
examiner_model: str = "gpt-4",
is_azure: Optional[bool] = False,
) -> Callable[[Log], float]:
"""
This factory creates an evaluation function that measures the factuality of an LLM's response to a given question.
It is based on the paper [LM vs LM: Detecting Factual Errors via Cross Examination](https://arxiv.org/abs/2305.13281) which proposes using
@@ -14,6 +17,7 @@ def lm_vs_lm_factuality_factory(examiner_model: str = "gpt-3.5-turbo") -> Callab
model to say, "I don't know," if it is uncertain.
Args:
is_azure: Whether to use the Azure API. Defaults to False.
examiner_model: The model which will examine the original model. Currently, only supports OpenAI chat models.
Returns:
@@ -32,6 +36,7 @@ def lm_vs_lm_factuality(log: Log) -> float:
model=examiner_model,
messages=messages_examiner,
temperature=0.0,
is_azure=is_azure,
)
messages_examiner += [{"role": "assistant", "content": follow_up_questions}]
n_rounds_follow_up_questions = 1
@@ -42,14 +47,15 @@
while follow_up_questions is not None:
messages_examinee += [{"role": "user", "content": follow_up_questions}]
follow_up_answers = call_openai(
model=log.configuration.model,
model=examiner_model if is_azure else log.configuration.model,
messages=messages_examinee,
temperature=log.configuration.model_params.temp,
top_p=log.configuration.model_params.top_p,
frequency_penalty=log.configuration.model_params.frequency_penalty,
presence_penalty=log.configuration.model_params.presence_penalty,
max_tokens=log.configuration.model_params.max_length,
response_format=log.configuration.model_params.response_format,
is_azure=is_azure,
)
messages_examiner.append({"role": "assistant", "content": follow_up_answers})

@@ -63,6 +69,7 @@ def lm_vs_lm_factuality(log: Log) -> float:
model=examiner_model,
messages=messages_examiner,
temperature=0.0,
is_azure=is_azure,
)
messages_examiner += [{"role": "assistant", "content": examiner_response}]
if "yes" in examiner_response.lower():
@@ -80,6 +87,7 @@ def lm_vs_lm_factuality(log: Log) -> float:
model=examiner_model,
messages=messages_examiner,
temperature=0.0,
is_azure=is_azure,
)
return float("incorrect" not in examiner_response.lower())

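Worth noting in this diff: when is_azure=True, the examinee follow-up calls also use examiner_model rather than log.configuration.model, so a single Azure deployment plays both roles. A hedged usage sketch (placeholder answering function, deployment name borrowed from the cookbook):

from parea import trace
from parea.evals.general.lm_vs_lm import lm_vs_lm_factuality_factory

# One Azure deployment acts as both examiner and examinee when is_azure=True.
factuality_azure = lm_vs_lm_factuality_factory(examiner_model="gpt-model-0613", is_azure=True)


@trace(eval_funcs=[factuality_azure])
def answer_question(question: str) -> str:
    # placeholder: produce the answer that will be cross-examined for factuality
    ...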
14 changes: 8 additions & 6 deletions parea/evals/general/semantic_similarity.py
@@ -1,21 +1,23 @@
from typing import Callable, Union
from typing import Callable, Optional, Union

import numpy as np

from parea.evals.utils import embed
from parea.schemas import Log


def semantic_similarity_factory(embd_model: str = "text-embedding-3-small") -> Callable[[Log], Union[float, None]]:
def semantic_similarity_factory(
embd_model: str = "text-embedding-3-small",
is_azure: Optional[bool] = False,
) -> Callable[[Log], Union[float, None]]:
def semantic_similarity(log: Log) -> Union[float, None]:
"""Calculates semantic similarity between output and target"""
output = log.output
if (target := log.target) is None:
return None

output_vector = embed(model=embd_model, input=output)
target_vector = embed(model=embd_model, input=target)

output_vector = embed(model=embd_model, input=output, is_azure=is_azure)
target_vector = embed(model=embd_model, input=target, is_azure=is_azure)
output_vector = np.array(output_vector)
target_vector = np.array(target_vector)

@@ -24,6 +26,6 @@ def semantic_similarity(log: Log) -> Union[float, None]:
return semantic_similarity


semantic_similarity_oai_3_small = semantic_similarity_factory(embd_model="text-embedding-3-small")
semantic_similarity_oai_3_small = semantic_similarity_factory()
semantic_similarity_oai_3_large = semantic_similarity_factory(embd_model="text-embedding-3-large")
semantic_similarity_oai_ada_002 = semantic_similarity_factory(embd_model="text-embedding-ada-002")
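A sketch of the new Azure path for this metric (not part of the diff); here the embd_model value is assumed to name an Azure embedding deployment, e.g. the text-embedding-3-small deployment from the cookbook, rather than an OpenAI model id:

from parea import trace
from parea.evals.general.semantic_similarity import semantic_similarity_factory

# Embed output and target via the Azure deployment and compare by cosine similarity.
semantic_similarity_azure = semantic_similarity_factory(embd_model="text-embedding-3-small", is_azure=True)


@trace(eval_funcs=[semantic_similarity_azure])
def answer_question(question: str) -> str:
    # placeholder: the traced output is compared against the dataset target
    ...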
(Diffs for the remaining 12 changed files are not shown.)
