
Commit

Merge pull request #155 from parea-ai/PAI-335-add-eval-metrics-to-python-sdk-trace-decorator-local-benchmark

PAI-335: add eval metrics to Python SDK trace decorator local benchmark
joschkabraun committed Oct 20, 2023
2 parents bffcada + 68944a4 commit 568186e
Showing 11 changed files with 297 additions and 32 deletions.
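For orientation, here is a minimal sketch (not part of the diff) of the usage this commit enables, based on the cookbook changes below; the function names and the constant score are illustrative:

import os
from parea import RedisCache, init
from parea.utils.trace_utils import trace

init(api_key=os.getenv("PAREA_API_KEY"), cache=RedisCache())

# local evaluation function: receives the traced function's inputs and output, returns a score in [0, 1]
def my_eval(inputs: dict, output: str, target: str = None) -> float:
    return 1.0 if output else 0.0

@trace(eval_funcs=[my_eval])
def my_llm_step(query: str) -> str:
    ...  # call an LLM and return its answer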
22 changes: 7 additions & 15 deletions parea/benchmark.py
@@ -13,6 +13,7 @@
from tqdm import tqdm

from parea.cache.redis import RedisCache
+from parea.helpers import write_trace_logs_to_csv
from parea.schemas.models import TraceLog


@@ -50,8 +51,8 @@ def async_wrapper(fn, **kwargs):

def run_benchmark(args):
parser = argparse.ArgumentParser()
parser.add_argument("--func", help="Function to test e.g., path/to/my_code.py:argument_chain", type=str)
parser.add_argument("--csv_path", help="Path to the input CSV file", type=str)
parser.add_argument("--func", help="Function to test e.g., path/to/my_code.py:argument_chain", type=str, required=True)
parser.add_argument("--csv_path", help="Path to the input CSV file", type=str, required=True)
parser.add_argument("--redis_host", help="Redis host", type=str, default=os.getenv("REDIS_HOST", "localhost"))
parser.add_argument("--redis_port", help="Redis port", type=int, default=int(os.getenv("REDIS_PORT", 6379)))
parser.add_argument("--redis_password", help="Redis password", type=str, default=None)
@@ -69,23 +70,14 @@ def run_benchmark(args):
futures = [executor.submit(async_wrapper, fn, **data_input) for data_input in data_inputs]
else:
futures = [executor.submit(fn, **data_input) for data_input in data_inputs]
-for f in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
+for _ in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
pass
print(f"Done with {len(futures)} inputs")

-redis_cache = RedisCache(key_logs=redis_logs_key)

-trace_logs: list[TraceLog] = redis_cache.read_logs()
+redis_cache = RedisCache(key_logs=redis_logs_key, host=args.redis_host, port=args.redis_port, password=args.redis_password)

-# write to csv
path_csv = f"trace_logs-{int(time.time())}.csv"
-with open(path_csv, "w", newline="") as file:
-# write header
-columns = fields_dict(TraceLog).keys()
-writer = csv.DictWriter(file, fieldnames=columns)
-writer.writeheader()
-# write rows
-for trace_log in trace_logs:
-writer.writerow(asdict(trace_log))

+trace_logs: list[TraceLog] = redis_cache.read_logs()
+write_trace_logs_to_csv(path_csv, trace_logs)
print(f"Wrote CSV of results to: {path_csv}")
2 changes: 1 addition & 1 deletion parea/client.py
@@ -31,7 +31,7 @@ def __attrs_post_init__(self):
if self.api_key:
parea_logger.set_client(self._client)
if isinstance(self.cache, RedisCache):
-parea_logger.set_redis_lru_cache(self.cache)
+parea_logger.set_redis_cache(self.cache)
_init_parea_wrapper(logger_all_possible, self.cache)

def completion(self, data: Completion) -> CompletionResponse:
178 changes: 178 additions & 0 deletions parea/cookbook/tracing_and_evaluating_openai_endpoint.py
@@ -0,0 +1,178 @@
from typing import Dict, List

import json
import os
import time

import openai
from dotenv import load_dotenv

from parea import RedisCache, init
from parea.helpers import write_trace_logs_to_csv
from parea.utils.trace_utils import get_current_trace_id, trace

load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")


use_cache = True
cache = RedisCache() if use_cache else None
init(api_key=os.getenv("PAREA_API_KEY"), cache=cache)


def call_llm(data: list[dict], model: str = "gpt-3.5-turbo", temperature: float = 0.0) -> str:
return openai.ChatCompletion.create(model=model, temperature=temperature, messages=data).choices[0].message["content"]


def goal_success_ratio(inputs: Dict, output: str, target: str = None) -> float:
"""Returns the average amount of turns the user had to converse with the AI to reach their goals."""
output = json.loads(output)
# determine where a new goal starts
conversation_segments = []
start_index = 0
end_index = 3
while end_index < len(output):
user_follows_same_goal = call_llm(
[
{
"role": "system",
"content": "Look at the conversation and to determine if the user is still following the same goal "
"or if they are following a new goal. If they are following the same goal, respond "
"SAME_GOAL. Otherwise, respond NEW_GOAL. In any case do not answer the user request!",
}
]
+ output[start_index:end_index],
model="gpt-4",
)

if user_follows_same_goal == "SAME_GOAL":
end_index += 2
else:
conversation_segments.append(output[start_index : end_index - 1])
start_index = end_index - 1
end_index += 2

if start_index < len(output):
conversation_segments.append(output[start_index:])

# for now assume that the user reached their goal in every segment
# so each segment is scored as 2 / number of messages and the scores are averaged over all segments
return sum([2 / len(segment) for segment in conversation_segments]) / len(conversation_segments)


def friendliness(inputs: Dict, output: str, target: str = None) -> float:
response = call_llm(
[
{"role": "system", "content": "You evaluate the friendliness of the following response on a scale of 0 to 10. You must only return a number."},
{"role": "assistant", "content": output},
],
model="gpt-4",
)
try:
return float(response) / 10.0
except (TypeError, ValueError):
return 0.0


def usefulness(inputs: Dict, output: str, target: str = None) -> float:
user_input = inputs["messages"][-1]["content"]
response = call_llm(
[
{"role": "system", "content": "You evaluate the usefulness of the response given the user input on a scale of 0 to 10. You must only return a number."},
{"role": "assistant", "content": f'''User input: "{user_input}"\nAssistant response: "{output}"'''},
],
model="gpt-4",
)
try:
return float(response) / 10.0
except (TypeError, ValueError):
return 0.0


@trace(eval_funcs=[friendliness, usefulness])
def helpful_the_second_time(messages: List[Dict[str, str]]) -> str:
helpful_response = call_llm(
[
{"role": "system", "content": "You are a friendly, and helpful assistant that helps people with their homework."},
]
+ messages,
model="gpt-4",
)

has_user_asked_before_raw = call_llm(
[
{
"role": "system",
"content": "Assess if the user has asked the last question before or is asking again for more \
information on a previous topic. If so, respond ASKED_BEFORE. Otherwise, respond NOT_ASKED_BEFORE.",
}
]
+ messages,
model="gpt-4",
)
has_user_asked_before = has_user_asked_before_raw == "ASKED_BEFORE"

if has_user_asked_before:
messages.append({"role": "assistant", "content": helpful_response})
return helpful_response
else:
unhelpful_response = call_llm(
[
{
"role": "system",
"content": "Given the helpful response to the user input below, please provide a slightly unhelpful \
response which makes the user ask again in case they didn't ask already again because of a previous unhelpful answer. \
In case the user asked again, please provide a last response",
},
]
+ messages
+ [{"role": "assistant", "content": helpful_response}],
model="gpt-4",
)
messages.append({"role": "assistant", "content": unhelfpul_response})
return unhelfpul_response


@trace(eval_funcs=[goal_success_ratio], access_output_of_func=lambda x: x[0])
def unhelpful_chat():
print("Welcome to the chat! Type 'exit' to end the session.")

trace_id = get_current_trace_id()

messages = []
while True:
user_input = input("\nYou: ")

if user_input.lower() == "exit":
print("Goodbye!")
break

messages.append({"role": "user", "content": user_input})
print("Bot:", helpful_the_second_time(messages))

return messages, trace_id


def main():
_, trace_id = unhelpful_chat()

if os.getenv("PAREA_API_KEY"):
print(f"You can view the logs at: https://optimusprompt.ai/logs/detailed/{trace_id}")
if use_cache:
time.sleep(5) # wait for local eval function to finish
path_csv = f"trace_logs-{int(time.time())}.csv"
trace_logs = cache.read_logs()
write_trace_logs_to_csv(path_csv, trace_logs)
print(f"CSV-file of results: {path_csv}")
parent_trace = None
for trace_log in trace_logs:
if trace_log.trace_id == trace_id:
parent_trace = trace_log
break
if parent_trace:
print(f"Overall score(s):\n{json.dumps(parent_trace.scores)}")


if __name__ == "__main__":
main()
14 changes: 11 additions & 3 deletions parea/cookbook/tracing_with_open_ai_endpoint_directly.py
@@ -1,4 +1,7 @@
+from typing import Dict, Optional

import os
+import random
from datetime import datetime

import openai
@@ -19,7 +22,12 @@ def call_llm(data: list[dict], model: str = "gpt-3.5-turbo", temperature: float
return openai.ChatCompletion.create(model=model, temperature=temperature, messages=data).choices[0].message["content"]


-@trace
+def random_eval(inputs: Dict[str, str], output, target: Optional[str] = None) -> float:
+# return random number between 0 and 1
+return random.random()


+@trace(eval_funcs=[random_eval])
def argumentor(query: str, additional_description: str = "") -> str:
return call_llm(
[
@@ -48,7 +56,7 @@ def critic(argument: str) -> str:
)


-@trace
+@trace(eval_funcs=[random_eval])
def refiner(query: str, additional_description: str, argument: str, criticism: str) -> str:
return call_llm(
[
@@ -68,7 +76,7 @@ def refiner(query: str, additional_description: str, argument: str, criticism: s
)


-@trace
+@trace(eval_funcs=[random_eval], access_output_of_func=lambda x: x[0])
def argument_chain(query: str, additional_description: str = "") -> tuple[str, str]:
trace_id = get_current_trace_id()
argument = argumentor(query, additional_description)
16 changes: 16 additions & 0 deletions parea/helpers.py
@@ -1,6 +1,11 @@
+import csv
import time
import uuid

+from attr import asdict, fields_dict

+from parea.schemas.models import TraceLog


def gen_trace_id() -> str:
"""Generate a unique trace id for each chain of requests"""
@@ -13,3 +18,14 @@ def to_date_and_time_string(timestamp: float) -> str:

def date_and_time_string_to_timestamp(date_and_time_string: str) -> float:
return time.mktime(time.strptime(date_and_time_string, "%Y-%m-%d %H:%M:%S %Z"))


+def write_trace_logs_to_csv(path_csv: str, trace_logs: list[TraceLog]):
+with open(path_csv, "w", newline="") as file:
+# write header
+columns = fields_dict(TraceLog).keys()
+writer = csv.DictWriter(file, fieldnames=columns)
+writer.writeheader()
+# write rows
+for trace_log in trace_logs:
+writer.writerow(asdict(trace_log))
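As the cookbook scripts above do, the new helper can be paired with RedisCache.read_logs() to dump locally cached trace logs to CSV; a minimal sketch, assuming a local Redis reachable with the default settings:

import time

from parea import RedisCache
from parea.helpers import write_trace_logs_to_csv

cache = RedisCache()  # default host/port, as in the cookbook example
trace_logs = cache.read_logs()
write_trace_logs_to_csv(f"trace_logs-{int(time.time())}.csv", trace_logs)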
10 changes: 5 additions & 5 deletions parea/parea_logger.py
@@ -10,13 +10,13 @@
@define
class PareaLogger:
_client: HTTPClient = field(init=False, default=None)
-_redis_lru_cache: RedisCache = field(init=False, default=None)
+_redis_cache: RedisCache = field(init=False, default=None)

def set_client(self, client: HTTPClient) -> None:
self._client = client

-def set_redis_lru_cache(self, cache: RedisCache) -> None:
-self._redis_lru_cache = cache
+def set_redis_cache(self, cache: RedisCache) -> None:
+self._redis_cache = cache

def record_log(self, data: TraceLog) -> None:
self._client.request(
@@ -33,10 +33,10 @@ async def arecord_log(self, data: TraceLog) -> None:
)

def write_log(self, data: TraceLog) -> None:
-self._redis_lru_cache.log(data)
+self._redis_cache.log(data)

def default_log(self, data: TraceLog) -> None:
-if self._redis_lru_cache:
+if self._redis_cache:
self.write_log(data)
if self._client:
self.record_log(data)
10 changes: 9 additions & 1 deletion parea/schemas/models.py
@@ -109,6 +109,12 @@ class FeedbackRequest:
target: Optional[str] = None


+@define
+class NamedEvaluationScore:
+name: str
+score: float = field(validator=[validators.ge(0), validators.le(1)])


@define
class TraceLog:
trace_id: str
@@ -119,14 +125,16 @@ class TraceLog:
error: Optional[str] = None
status: Optional[str] = None
deployment_id: Optional[str] = None
-evaluation_metric_ids: Optional[list[int]] = None
cache_hit: bool = False
configuration: LLMInputs = LLMInputs()
latency: Optional[float] = 0.0
input_tokens: Optional[int] = 0
output_tokens: Optional[int] = 0
total_tokens: Optional[int] = 0
cost: Optional[float] = 0.0
+output_for_eval_metrics: Optional[str] = None
+evaluation_metric_names: Optional[list[str]] = None
+scores: Optional[list[NamedEvaluationScore]] = None
feedback_score: Optional[float] = None

# info filled from decorator
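A small illustration (not part of the diff) of the new NamedEvaluationScore validators above; the attrs validators reject scores outside [0, 1]:

from parea.schemas.models import NamedEvaluationScore

NamedEvaluationScore(name="friendliness", score=0.8)  # accepted
NamedEvaluationScore(name="friendliness", score=1.2)  # rejected: validators.le(1) raises ValueError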