-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathevaluation.py
More file actions
115 lines (98 loc) · 3.46 KB
/
evaluation.py
File metadata and controls
115 lines (98 loc) · 3.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
from langsmith import Client
from langsmith import evaluate
from pathlib import Path
import os
from dotenv import load_dotenv
from langchain_openai import AzureChatOpenAI
import uuid
# Load Azure OpenAI connection settings from a local secrets file.
env_path = Path('.') / 'secrets.env'
load_dotenv(dotenv_path=env_path)
# Connection details for the Azure OpenAI deployment that serves as the judge.
openaiendpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
openaikey = os.getenv("AZURE_OPENAI_API_KEY")
openapideploymentname = os.getenv("AZURE_OPENAI_GPT4_DEPLOYMENT_NAME")
aiapiversion = os.getenv("AZURE_OPENAI_API_VERSION")
# LLM-as-judge used by the `correct` evaluator below.
# temperature=0 keeps the grading as deterministic as the model allows.
judge_llm = AzureChatOpenAI(
    azure_deployment=openapideploymentname,
    azure_endpoint=openaiendpoint,
    openai_api_key=openaikey,
    api_version=aiapiversion,
    verbose=False,
    temperature=0,
)
# Ground-truth Q/A pairs for the evaluation.
# NOTE(review): `questions` and `answers` are not referenced below
# (`example_inputs` duplicates them); kept so any external importers of these
# module-level names keep working.
questions = [
    "What was Microsoft's cloud revenue for 2024?",
    "Did linkedin's revenue grow in 2024?"
]
answers = [
    "Microsoft's cloud revenue for 2024 was $137.4 billion.",
    "Yes, LinkedIn's revenue grew in 2024."
]
ls_client = Client()
dataset_name = "Agent Evaluation"
# Recreate the dataset from scratch on every run. `delete_dataset` raises
# when the dataset does not exist, so tolerate that case instead of crashing
# on the very first run (the original comment promised "if it exists" but the
# call was unguarded).
try:
    ls_client.delete_dataset(dataset_name=dataset_name)
except Exception:
    pass  # Dataset did not exist yet - nothing to delete.
# (question, expected answer) pairs that seed the dataset examples.
example_inputs = [
    ("RAG: What was Microsoft's cloud revenue for 2024?", "Microsoft's cloud revenue for 2024 was $137.4 billion."),
    ("RAG: Did linkedin's revenue grow in 2024?", "Yes, LinkedIn's revenue grew in 2024."),
]
dataset = ls_client.create_dataset(
    dataset_name=dataset_name,
    description="Dataset for evaluating the performance of the RAG agent.",
)
print("Dataset ID:: ", dataset.id)
print("Dataset Name:: ", dataset.name)
# Inputs carry the question; outputs carry the reference answer the
# `correct` evaluator compares against.
ls_client.create_examples(
    dataset_id=dataset.id,
    inputs=[{"question": q} for q, _ in example_inputs],
    outputs=[{"answer": a} for _, a in example_inputs],
)
def get_thread_id():
    """Return a fresh conversation thread identifier.

    Returns:
        str: A random UUID4 rendered in its canonical string form.
    """
    # A new id per call ensures each evaluation run gets its own thread.
    new_id = uuid.uuid4()
    return f"{new_id}"
# Per-run LangGraph configuration: a fresh thread id so conversation state
# is not shared across evaluation runs.
config = {"configurable": {"thread_id": get_thread_id()}}
def correct(outputs: dict, reference_outputs: dict) -> bool:
    """LLM-as-judge evaluator: does the actual answer cover the expected one?

    Args:
        outputs: The target's output state; expects a 'messages' list whose
            last element exposes the agent's final answer via `.content`.
        reference_outputs: The dataset example's outputs; expects an
            'answer' key holding the reference answer text.

    Returns:
        bool: True when the judge model's verdict is 'CORRECT'.
    """
    instructions = (
        "Given an actual answer and an expected answer, determine whether"
        " the actual answer contains all of the information in the"
        " expected answer. Respond with 'CORRECT' if the actual answer"
        " does contain all of the expected information and 'INCORRECT'"
        " otherwise. Do not include anything else in your response."
    )
    # Our graph outputs a State dictionary, which in this case means
    # we'll have a 'messages' key and the final message should
    # be our actual answer.
    actual_answer = outputs["messages"][-1].content
    expected_answer = reference_outputs["answer"]
    user_msg = (
        f"ACTUAL ANSWER: {actual_answer}"
        f"\n\nEXPECTED ANSWER: {expected_answer}"
    )
    response = judge_llm.invoke(
        [
            {"role": "system", "content": instructions},
            {"role": "user", "content": user_msg},
        ]
    )
    # Bug fix: the original compared `response.content.upper() == "CORRECT"`
    # exactly, so a verdict padded with whitespace or a trailing period was
    # scored as incorrect. Normalize the verdict before comparing.
    verdict = response.content.strip().rstrip(".").upper()
    return verdict == "CORRECT"
def example_to_state(inputs: dict) -> dict:
    """Adapt a dataset example's inputs into the graph's input state.

    Args:
        inputs: Dataset example inputs; must contain a 'question' key.

    Returns:
        dict: A graph state with the question wrapped as a user message.
    """
    question_text = inputs["question"]
    user_message = {"role": "user", "content": question_text}
    return {"messages": [user_message]}
def evaluate_agents(app):
    """Run the LangSmith evaluation experiment against the agent graph.

    Args:
        app: The compiled LangGraph application (a LangChain runnable).

    Returns:
        The LangSmith experiment results object, so callers can inspect
        or report on the run.
    """
    # NOTE(review): assigning `app.config` looks intended to bind the
    # thread-scoped run config, but confirm the graph actually reads this
    # attribute — the usual runnable pattern is `app.with_config(config)`.
    app.config = config
    # We use LCEL declarative syntax here.
    # Remember that langgraph graphs are also langchain runnables.
    target = example_to_state | app
    experiment_results = evaluate(
        target,
        data=dataset_name,
        evaluators=[correct],
        max_concurrency=4,  # optional
        experiment_prefix="Agentic-AI-Demo-Baseline",  # optional
    )
    # Bug fix: the results were previously computed and then discarded;
    # return them so the caller can use the experiment outcome.
    return experiment_results