Merge pull request #252 from parea-ai/PAI-446-add-eval-metric-cookbook
feat: update cookbook for chat
joschkabraun committed Dec 4, 2023
2 parents b134eb1 + 4719496 commit 615f1c4
Showing 9 changed files with 154 additions and 94 deletions.
2 changes: 1 addition & 1 deletion parea/__init__.py
@@ -12,7 +12,7 @@
from importlib import metadata as importlib_metadata

from parea.benchmark import run_benchmark
from parea.cache import RedisCache
from parea.cache import InMemoryCache, RedisCache
from parea.client import Parea, init


1 change: 1 addition & 0 deletions parea/cache/__init__.py
@@ -1 +1,2 @@
from .in_memory import InMemoryCache
from .redis import RedisCache
24 changes: 23 additions & 1 deletion parea/cache/cache.py
@@ -1,4 +1,4 @@
from typing import Optional
from typing import List, Optional

from abc import ABC

@@ -81,3 +81,25 @@ async def ainvalidate(self, key: CacheRequest):
# noqa: DAR401
"""
raise NotImplementedError

def log(self, value: TraceLog):
"""
Log a response in the cache.
Args:
value (TraceLog): The response to log.
# noqa: DAR401
"""
raise NotImplementedError

def read_logs(self) -> List[TraceLog]:
"""
Read all logs from the cache.
Returns:
List[TraceLog]: All logs in the cache.
# noqa: DAR401
"""
raise NotImplementedError
40 changes: 40 additions & 0 deletions parea/cache/in_memory.py
@@ -0,0 +1,40 @@
from typing import List, Optional

import json

from attr import asdict

from parea.cache.cache import Cache
from parea.schemas.models import CacheRequest, TraceLog


class InMemoryCache(Cache):
def __init__(self):
self.cache = {}
self.logs = []

def get(self, key: CacheRequest) -> Optional[TraceLog]:
return self.cache.get(json.dumps(asdict(key)))

async def aget(self, key: CacheRequest) -> Optional[TraceLog]:
return self.get(key)

def set(self, key: CacheRequest, value: TraceLog):
self.cache[json.dumps(asdict(key))] = value

async def aset(self, key: CacheRequest, value: TraceLog):
self.set(key, value)

def invalidate(self, key: CacheRequest):
key = json.dumps(asdict(key))
if key in self.cache:
del self.cache[key]

async def ainvalidate(self, key: CacheRequest):
self.invalidate(key)

def log(self, value: TraceLog):
self.logs.append(value)

def read_logs(self) -> List[TraceLog]:
return self.logs.copy()
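
A quick note on the new backend: `get`/`set` serialize the attrs-based key with `json.dumps(asdict(key))`, so any attrs instance with equal field values maps to the same entry, while `log`/`read_logs` simply append to and copy a plain list. Below is a minimal local sketch of that behavior; `DummyKey` and the string values stand in for the real `CacheRequest`/`TraceLog` schema objects, whose required fields aren't shown in this diff:

```python
from attrs import define

from parea.cache import InMemoryCache


@define
class DummyKey:
    # stand-in for parea.schemas.models.CacheRequest; the real required fields aren't shown here
    prompt: str


cache = InMemoryCache()
key = DummyKey(prompt="What is 2 + 2?")

cache.set(key, "4")  # a real caller would store a TraceLog here
assert cache.get(key) == "4"  # equal attrs content -> identical JSON key -> cache hit

cache.invalidate(key)
assert cache.get(key) is None

cache.log("a TraceLog would go here")
print(cache.read_logs())  # shallow copy of everything logged so far
```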
4 changes: 2 additions & 2 deletions parea/client.py
@@ -7,8 +7,8 @@
from attrs import asdict, define, field

from parea.api_client import HTTPClient
from parea.cache import InMemoryCache, RedisCache
from parea.cache.cache import Cache
from parea.cache.redis import RedisCache
from parea.helpers import gen_trace_id
from parea.parea_logger import parea_logger
from parea.schemas.models import Completion, CompletionResponse, FeedbackRequest, UseDeployedPrompt, UseDeployedPromptResponse
@@ -30,7 +30,7 @@ def __attrs_post_init__(self):
self._client.set_api_key(self.api_key)
if self.api_key:
parea_logger.set_client(self._client)
if isinstance(self.cache, RedisCache):
if isinstance(self.cache, (RedisCache, InMemoryCache)):
parea_logger.set_redis_cache(self.cache)
_init_parea_wrapper(logger_all_possible, self.cache)

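The functional change here is just the widened isinstance check: an InMemoryCache handed to the client is now registered with parea_logger the same way a RedisCache is, which is what lets the updated cookbook below capture trace logs without a Redis server. Minimal wiring, mirroring the cookbook:

```python
import os

from parea import InMemoryCache, init

cache = InMemoryCache()
init(api_key=os.getenv("PAREA_API_KEY"), cache=cache)

# ...call @trace-decorated functions here...

print(len(cache.read_logs()))  # trace logs captured in this process
```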
75 changes: 22 additions & 53 deletions parea/cookbook/tracing_and_evaluating_openai_endpoint.py
@@ -5,10 +5,14 @@
import time

import openai
from attr import asdict
from dotenv import load_dotenv

from parea import RedisCache, init
from parea import InMemoryCache, init
from parea.evals.chat import goal_success_ratio_factory
from parea.evals.utils import call_openai
from parea.helpers import write_trace_logs_to_csv
from parea.schemas.models import Log
from parea.utils.trace_utils import get_current_trace_id, trace

load_dotenv()
@@ -17,52 +17,21 @@


use_cache = True
cache = RedisCache() if use_cache else None
cache = InMemoryCache() if use_cache else None
init(api_key=os.getenv("PAREA_API_KEY"), cache=cache)


def call_llm(data: list[dict], model: str = "gpt-3.5-turbo", temperature: float = 0.0) -> str:
return openai.ChatCompletion.create(model=model, temperature=temperature, messages=data).choices[0].message["content"]


def goal_success_ratio(inputs: Dict, output: str, target: str = None) -> float:
"""Returns the average amount of turns the user had to converse with the AI to reach their goals."""
output = json.loads(output)
# need to determine where does a new goal start
conversation_segments = []
start_index = 0
end_index = 3
while end_index < len(output):
user_follows_same_goal = call_llm(
[
{
"role": "system",
"content": "Look at the conversation and to determine if the user is still following the same goal "
"or if they are following a new goal. If they are following the same goal, respond "
"SAME_GOAL. Otherwise, respond NEW_GOAL. In any case do not answer the user request!",
}
]
+ output[start_index:end_index],
model="gpt-4",
)

if user_follows_same_goal == "SAME_GOAL":
end_index += 2
else:
conversation_segments.append(output[start_index : end_index - 1])
start_index = end_index - 1
end_index += 2

if start_index < len(output):
conversation_segments.append(output[start_index:])

# for now assume that the user reached their goal in every segment
# so we can return the average amount of turns the user had to converse with the AI to reach their goals
return sum([2 / len(segment) for segment in conversation_segments]) / len(conversation_segments)


def friendliness(inputs: Dict, output: str, target: str = None) -> float:
response = call_llm(
def friendliness(log: Log) -> float:
output = log.output
response = call_openai(
[
{"role": "system", "content": "You evaluate the friendliness of the following response on a scale of 0 to 10. You must only return a number."},
{"role": "assistant", "content": output},
@@ -75,9 +40,10 @@ def friendliness(inputs: Dict, output: str, target: str = None) -> float:
return 0.0


def usefulness(inputs: Dict, output: str, target: str = None) -> float:
user_input = inputs["messages"][-1]["content"]
response = call_llm(
def usefulness(log: Log) -> float:
user_input = log.inputs["messages"][-1]["content"]
output = log.output
response = call_openai(
[
{"role": "system", "content": "You evaluate the usefulness of the response given the user input on a scale of 0 to 10. You must only return a number."},
{"role": "assistant", "content": f'''User input: "{user_input}"\nAssistant response: "{output}"'''},
@@ -92,15 +58,15 @@ def usefulness(inputs: Dict, output: str, target: str = None) -> float:

@trace(eval_funcs=[friendliness, usefulness])
def helpful_the_second_time(messages: List[Dict[str, str]]) -> str:
helpful_response = call_llm(
helpful_response = call_openai(
[
{"role": "system", "content": "You are a friendly, and helpful assistant that helps people with their homework."},
]
+ messages,
model="gpt-4",
)

has_user_asked_before_raw = call_llm(
has_user_asked_before_raw = call_openai(
[
{
"role": "system",
@@ -117,7 +83,7 @@ def helpful_the_second_time(messages: List[Dict[str, str]]) -> str:
messages.append({"role": "assistant", "content": helpful_response})
return helpful_response
else:
unhelfpul_response = call_llm(
unhelfpul_response = call_openai(
[
{
"role": "system",
Expand All @@ -134,9 +100,12 @@ def helpful_the_second_time(messages: List[Dict[str, str]]) -> str:
return unhelfpul_response


goal_success_ratio = goal_success_ratio_factory(use_output=True)


@trace(eval_funcs=[goal_success_ratio], access_output_of_func=lambda x: x[0])
def unhelpful_chat():
print("Welcome to the chat! Type 'exit' to end the session.")
print("\nWelcome to the somewhat helpful chat! Type 'exit' to end the session.")

trace_id = get_current_trace_id()

@@ -164,14 +133,14 @@ def main():
path_csv = f"trace_logs-{int(time.time())}.csv"
trace_logs = cache.read_logs()
write_trace_logs_to_csv(path_csv, trace_logs)
print(f"CSV-file of results: {path_csv}")
print(f"\nCSV-file of traces: {path_csv}")
parent_trace = None
for trace_log in trace_logs:
if trace_log.trace_id == trace_id:
parent_trace = trace_log
break
if parent_trace:
print(f"Overall score(s):\n{json.dumps(parent_trace.scores)}")
print(f"Overall score(s):\n{json.dumps(parent_trace.scores, default=asdict, indent=2)}")


if __name__ == "__main__":
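Since the eval functions above now receive a single Log record instead of (inputs, output, target), adding another metric to the cookbook is a matter of writing one more Log-based function and listing it in eval_funcs. A sketch under that assumption; `concise` and `haiku_about` are illustrative names, not part of the SDK:

```python
from parea.evals.utils import call_openai
from parea.schemas.models import Log
from parea.utils.trace_utils import trace


def concise(log: Log) -> float:
    # toy heuristic: reward short answers without another LLM call
    return 1.0 if len(log.output or "") < 400 else 0.0


@trace(eval_funcs=[concise])
def haiku_about(topic: str) -> str:
    return call_openai([{"role": "user", "content": f"Write a haiku about {topic}."}], model="gpt-3.5-turbo")
```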
90 changes: 57 additions & 33 deletions parea/evals/chat.py
@@ -1,40 +1,64 @@
from typing import Callable, Optional

import json

from parea.evals.utils import call_openai
from parea.schemas.models import Log


def goal_success_ratio(log: Log) -> float:
"""Returns the average amount of turns the user had to converse with the AI to reach their goals."""
messages = [m.to_dict() for m in log.configuration.messages]
messages.append({"role": "assistant", "content": log.output})

# need to determine where does a new goal start
conversation_segments = []
start_index = 0
end_index = 3
while end_index < len(messages):
user_follows_same_goal = call_openai(
[
{
"role": "system",
"content": "Look at the conversation and to determine if the user is still following the same goal "
"or if they are following a new goal. If they are following the same goal, respond "
"SAME_GOAL. Otherwise, respond NEW_GOAL. In any case do not answer the user request!",
}
]
+ messages[start_index:end_index],
model="gpt-4",
)

if user_follows_same_goal == "SAME_GOAL":
end_index += 2
def goal_success_ratio_factory(use_output: Optional[bool] = False, message_field: Optional[str] = None) -> Callable[[Log], float]:
"""Factory function that returns a function that calculates the goal success ratio of a log.
Args:
use_output (Optional[bool], optional): Whether to use the output of the log to access the messages. Defaults to False.
message_field (Optional[str], optional): The name of the field in the log that contains the messages.
Defaults to None. If None, the messages are taken from the configuration attribute.
"""
if use_output and message_field:
raise ValueError("Only one of use_output and message_field can be set.")

def goal_success_ratio(log: Log) -> float:
"""Returns the average amount of turns the user had to converse with the AI to reach their goals."""
if use_output:
output_list_dicts = json.loads(log.output)
messages = [m for m in output_list_dicts]
elif message_field:
messages = [m for m in log.inputs[message_field]]
else:
conversation_segments.append(messages[start_index : end_index - 1])
start_index = end_index - 1
end_index += 2
messages = [m.to_dict() for m in log.configuration.messages]
if log.output:
messages.append({"role": "assistant", "content": log.output})

# need to determine where does a new goal start
conversation_segments = []
start_index = 0
end_index = 3
while end_index < len(messages):
user_follows_same_goal = call_openai(
[
{
"role": "system",
"content": "Look at the conversation and to determine if the user is still following the same goal "
"or if they are following a new goal. If they are following the same goal, respond "
"SAME_GOAL. Otherwise, respond NEW_GOAL. In any case do not answer the user request!",
}
]
+ messages[start_index:end_index],
model="gpt-4",
)

if user_follows_same_goal == "SAME_GOAL":
end_index += 2
else:
conversation_segments.append(messages[start_index : end_index - 1])
start_index = end_index - 1
end_index += 2

if start_index < len(messages):
conversation_segments.append(messages[start_index:])

if start_index < len(messages):
conversation_segments.append(messages[start_index:])
# for now assume that the user reached their goal in every segment
# return the average amount of turns the user had to converse with the AI to reach their goals
return sum([2 / len(segment) for segment in conversation_segments]) / len(conversation_segments)

# for now assume that the user reached their goal in every segment
# return the average amount of turns the user had to converse with the AI to reach their goals
return sum([2 / len(segment) for segment in conversation_segments]) / len(conversation_segments)
return goal_success_ratio
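
For reference, the three ways the factory above can be configured, matching its docstring; the variable names are only for illustration:

```python
from parea.evals.chat import goal_success_ratio_factory

# messages serialized as JSON in the trace output (what the updated cookbook uses)
from_output = goal_success_ratio_factory(use_output=True)

# messages passed to the traced function under an input field, here assumed to be called "messages"
from_inputs = goal_success_ratio_factory(message_field="messages")

# default: messages taken from log.configuration.messages
from_config = goal_success_ratio_factory()

# setting both options raises ValueError("Only one of use_output and message_field can be set.")
```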
10 changes: 7 additions & 3 deletions parea/utils/trace_utils.py
@@ -179,23 +179,27 @@ def logger_all_possible(trace_id: str):
def call_eval_funcs_then_log(trace_id: str, eval_funcs: list[Callable] = None, access_output_of_func: Callable = None):
data = trace_data.get()[trace_id]
try:
inputs = data.inputs
target = data.target
if eval_funcs and data.status == "success":
if access_output_of_func:
output = json.loads(data.output)
output = access_output_of_func(output)
output_for_eval_metrics = json_dumps(output)
else:
output_for_eval_metrics = data.output

data.output_for_eval_metrics = output_for_eval_metrics
output_old = data.output
data.output = data.output_for_eval_metrics
data.scores = []

for func in eval_funcs:
try:
score = func(inputs=inputs, output=output_for_eval_metrics, target=target)
score = func(data)
data.scores.append(NamedEvaluationScore(name=func.__name__, score=score))
except Exception as e:
logger.exception(f"Error occurred calling evaluation function '{func.__name__}', {e}", exc_info=e)

data.output = output_old
except Exception as e:
logger.exception(f"Error occurred in when trying to evaluate output, {e}", exc_info=e)
parea_logger.default_log(data=data)
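The call-site change above is what custom evals need to track: they used to be invoked as func(inputs=..., output=..., target=...) and are now handed the whole trace record. A sketch of the new shape, where `contains_target` is a hypothetical metric and `log.target` is assumed to hold the reference answer when one was provided:

```python
from parea.schemas.models import Log


def contains_target(log: Log) -> float:
    # the full record is available: log.inputs, log.output, log.target, log.configuration, ...
    if not log.target:
        return 0.0
    return float(log.target in (log.output or ""))
```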
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry]
name = "parea-ai"
packages = [{ include = "parea" }]
version = "0.2.19"
version = "0.2.20"
description = "Parea python sdk"
readme = "README.md"
authors = ["joel-parea-ai <[email protected]>"]
