From f6801bf2eca6a3234d3e0381d981e49e533f5fd0 Mon Sep 17 00:00:00 2001 From: Joschka Braun Date: Tue, 17 Oct 2023 11:08:00 +0200 Subject: [PATCH 01/13] feat: attach evaluation function via trace decorator --- parea/schemas/models.py | 9 +++++++- parea/utils/trace_utils.py | 43 +++++++++++++++++++++++++++++++++----- 2 files changed, 46 insertions(+), 6 deletions(-) diff --git a/parea/schemas/models.py b/parea/schemas/models.py index 089ace63..d91001d0 100644 --- a/parea/schemas/models.py +++ b/parea/schemas/models.py @@ -109,6 +109,12 @@ class FeedbackRequest: target: Optional[str] = None +@define +class NamedEvaluationScore: + name: str + score: float = field(validator=[validators.ge(0), validators.le(1)]) + + @define class TraceLog: trace_id: str @@ -119,7 +125,6 @@ class TraceLog: error: Optional[str] = None status: Optional[str] = None deployment_id: Optional[str] = None - evaluation_metric_ids: Optional[list[int]] = None cache_hit: bool = False configuration: LLMInputs = LLMInputs() latency: Optional[float] = 0.0 @@ -127,6 +132,8 @@ class TraceLog: output_tokens: Optional[int] = 0 total_tokens: Optional[int] = 0 cost: Optional[float] = 0.0 + evaluation_metric_ids: Optional[list[int]] = None + named_evaluation_scores: Optional[list[NamedEvaluationScore]] = None feedback_score: Optional[float] = None # info filled from decorator diff --git a/parea/utils/trace_utils.py b/parea/utils/trace_utils.py index d44f89b4..b15de456 100644 --- a/parea/utils/trace_utils.py +++ b/parea/utils/trace_utils.py @@ -1,4 +1,4 @@ -from typing import Any, Optional, Union +from typing import Any, Optional, Union, Callable import contextvars import inspect @@ -13,7 +13,7 @@ from parea.helpers import gen_trace_id, to_date_and_time_string from parea.parea_logger import parea_logger -from parea.schemas.models import CompletionResponse, TraceLog +from parea.schemas.models import CompletionResponse, TraceLog, NamedEvaluationScore logger = logging.getLogger() @@ -55,6 +55,8 @@ def trace( metadata: Optional[dict[str, Any]] = None, target: Optional[str] = None, end_user_identifier: Optional[str] = None, + eval_funcs: Optional[list[Callable]] = None, + access_output_of_func: Optional[Callable] = None, ): def init_trace(func_name, args, kwargs, func) -> tuple[str, float]: start_time = time.time() @@ -87,7 +89,7 @@ def cleanup_trace(trace_id, start_time): end_time = time.time() trace_data.get()[trace_id].end_timestamp = to_date_and_time_string(end_time) trace_data.get()[trace_id].latency = end_time - start_time - logger_all_possible(trace_id) + thread_eval_funcs_then_log(trace_id, eval_funcs, access_output_of_func) trace_context.get().pop() def decorator(func): @@ -98,7 +100,8 @@ async def async_wrapper(*args, **kwargs): try: result = await func(*args, **kwargs) output = make_output(result, output_as_list) - trace_data.get()[trace_id].output = json.dumps(output) + trace_data.get()[trace_id].output = output if isinstance(output, str) else json.dumps(output) + trace_data.get()[trace_id].status = "success" except Exception as e: logger.exception(f"Error occurred in function {func.__name__}, {e}") trace_data.get()[trace_id].error = str(e) @@ -115,7 +118,8 @@ def wrapper(*args, **kwargs): try: result = func(*args, **kwargs) output = make_output(result, output_as_list) - trace_data.get()[trace_id].output = json.dumps(output) + trace_data.get()[trace_id].output = output if isinstance(output, str) else json.dumps(output) + trace_data.get()[trace_id].status = "success" except Exception as e: logger.exception(f"Error occurred in function {func.__name__}, {e}") trace_data.get()[trace_id].error = str(e) @@ -169,3 +173,32 @@ def logger_all_possible(trace_id: str): kwargs={"data": trace_data.get()[trace_id]}, ) logging_thread.start() + + +def call_eval_funcs_then_log(trace_id: str, eval_funcs: list[Callable] = None, access_output_of_func: Callable = None): + data = trace_data.get()[trace_id] + try: + inputs = data.inputs + output = data.output + if access_output_of_func: + output = access_output_of_func(output) + target = data.target + if eval_funcs and data.status == "success": + data.named_evaluation_scores = [] + for func in eval_funcs: + try: + score = func(inputs=inputs, output=output, target=target) + data.named_evaluation_scores.append(NamedEvaluationScore(name=func.__name__, score=score)) + except Exception as e: + logger.exception(f"Error occurred calling evaluation function '{func.__name__}', {e}", exc_info=e) + except Exception as e: + logger.exception(f"Error occurred in when trying to evaluate output, {e}", exc_info=e) + parea_logger.default_log(data=data) + + +def thread_eval_funcs_then_log(trace_id: str, eval_funcs: list[Callable] = None, access_output_of_func: Callable = None): + logging_thread = threading.Thread( + target=call_eval_funcs_then_log, + kwargs={"trace_id": trace_id, "eval_funcs": eval_funcs, "access_output_of_func": access_output_of_func}, + ) + logging_thread.start() From 4f3d78f536c8276a9693373ad02f005940f27651 Mon Sep 17 00:00:00 2001 From: Joschka Braun Date: Tue, 17 Oct 2023 11:08:31 +0200 Subject: [PATCH 02/13] refactor: remove lru naming from redis cache --- parea/benchmark.py | 6 +++--- parea/client.py | 2 +- parea/parea_logger.py | 10 +++++----- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/parea/benchmark.py b/parea/benchmark.py index a236b1cc..a736bd5f 100644 --- a/parea/benchmark.py +++ b/parea/benchmark.py @@ -50,8 +50,8 @@ def async_wrapper(fn, **kwargs): def run_benchmark(args): parser = argparse.ArgumentParser() - parser.add_argument("--func", help="Function to test e.g., path/to/my_code.py:argument_chain", type=str) - parser.add_argument("--csv_path", help="Path to the input CSV file", type=str) + parser.add_argument("--func", help="Function to test e.g., path/to/my_code.py:argument_chain", type=str, required=True) + parser.add_argument("--csv_path", help="Path to the input CSV file", type=str, required=True) parser.add_argument("--redis_host", help="Redis host", type=str, default=os.getenv("REDIS_HOST", "localhost")) parser.add_argument("--redis_port", help="Redis port", type=int, default=int(os.getenv("REDIS_PORT", 6379))) parser.add_argument("--redis_password", help="Redis password", type=str, default=None) @@ -69,7 +69,7 @@ def run_benchmark(args): futures = [executor.submit(async_wrapper, fn, **data_input) for data_input in data_inputs] else: futures = [executor.submit(fn, **data_input) for data_input in data_inputs] - for f in tqdm(concurrent.futures.as_completed(futures), total=len(futures)): + for _ in tqdm(concurrent.futures.as_completed(futures), total=len(futures)): pass print(f"Done with {len(futures)} inputs") diff --git a/parea/client.py b/parea/client.py index dc8b1d4c..f0043dfb 100644 --- a/parea/client.py +++ b/parea/client.py @@ -31,7 +31,7 @@ def __attrs_post_init__(self): if self.api_key: parea_logger.set_client(self._client) if isinstance(self.cache, RedisCache): - parea_logger.set_redis_lru_cache(self.cache) + parea_logger.set_redis_cache(self.cache) _init_parea_wrapper(logger_all_possible, self.cache) def completion(self, data: Completion) -> CompletionResponse: diff --git a/parea/parea_logger.py b/parea/parea_logger.py index 796f4999..67f362b0 100644 --- a/parea/parea_logger.py +++ b/parea/parea_logger.py @@ -10,13 +10,13 @@ @define class PareaLogger: _client: HTTPClient = field(init=False, default=None) - _redis_lru_cache: RedisCache = field(init=False, default=None) + _redis_cache: RedisCache = field(init=False, default=None) def set_client(self, client: HTTPClient) -> None: self._client = client - def set_redis_lru_cache(self, cache: RedisCache) -> None: - self._redis_lru_cache = cache + def set_redis_cache(self, cache: RedisCache) -> None: + self._redis_cache = cache def record_log(self, data: TraceLog) -> None: self._client.request( @@ -33,10 +33,10 @@ async def arecord_log(self, data: TraceLog) -> None: ) def write_log(self, data: TraceLog) -> None: - self._redis_lru_cache.log(data) + self._redis_cache.log(data) def default_log(self, data: TraceLog) -> None: - if self._redis_lru_cache: + if self._redis_cache: self.write_log(data) if self._client: self.record_log(data) From d70807d1556589e58eadc7b8b7663a000a58ca02 Mon Sep 17 00:00:00 2001 From: Joschka Braun Date: Tue, 17 Oct 2023 13:39:44 +0200 Subject: [PATCH 03/13] docs: update cookbook --- parea/cookbook/tracing_with_open_ai_endpoint_directly.py | 9 ++++++++- parea/schemas/models.py | 2 +- parea/utils/trace_utils.py | 4 ++-- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/parea/cookbook/tracing_with_open_ai_endpoint_directly.py b/parea/cookbook/tracing_with_open_ai_endpoint_directly.py index 90002ad4..7b711321 100644 --- a/parea/cookbook/tracing_with_open_ai_endpoint_directly.py +++ b/parea/cookbook/tracing_with_open_ai_endpoint_directly.py @@ -1,5 +1,7 @@ import os +import random from datetime import datetime +from typing import Dict, Optional import openai from dotenv import load_dotenv @@ -19,7 +21,12 @@ def call_llm(data: list[dict], model: str = "gpt-3.5-turbo", temperature: float return openai.ChatCompletion.create(model=model, temperature=temperature, messages=data).choices[0].message["content"] -@trace +def random_eval(inputs: Dict[str, str], output, target: Optional[str] = None) -> float: + # return random number between 0 and 1 + return random.random() + + +@trace(eval_funcs=[random_eval]) def argumentor(query: str, additional_description: str = "") -> str: return call_llm( [ diff --git a/parea/schemas/models.py b/parea/schemas/models.py index d91001d0..71f2b962 100644 --- a/parea/schemas/models.py +++ b/parea/schemas/models.py @@ -133,7 +133,7 @@ class TraceLog: total_tokens: Optional[int] = 0 cost: Optional[float] = 0.0 evaluation_metric_ids: Optional[list[int]] = None - named_evaluation_scores: Optional[list[NamedEvaluationScore]] = None + scores: Optional[list[NamedEvaluationScore]] = None feedback_score: Optional[float] = None # info filled from decorator diff --git a/parea/utils/trace_utils.py b/parea/utils/trace_utils.py index b15de456..c4108107 100644 --- a/parea/utils/trace_utils.py +++ b/parea/utils/trace_utils.py @@ -184,11 +184,11 @@ def call_eval_funcs_then_log(trace_id: str, eval_funcs: list[Callable] = None, a output = access_output_of_func(output) target = data.target if eval_funcs and data.status == "success": - data.named_evaluation_scores = [] + data.scores = [] for func in eval_funcs: try: score = func(inputs=inputs, output=output, target=target) - data.named_evaluation_scores.append(NamedEvaluationScore(name=func.__name__, score=score)) + data.scores.append(NamedEvaluationScore(name=func.__name__, score=score)) except Exception as e: logger.exception(f"Error occurred calling evaluation function '{func.__name__}', {e}", exc_info=e) except Exception as e: From 1bc6bfbd83a8fe7691f035bd352d93a00e998c48 Mon Sep 17 00:00:00 2001 From: Joschka Braun Date: Tue, 17 Oct 2023 13:39:55 +0200 Subject: [PATCH 04/13] docs: update cookbook --- parea/cookbook/tracing_with_open_ai_endpoint_directly.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parea/cookbook/tracing_with_open_ai_endpoint_directly.py b/parea/cookbook/tracing_with_open_ai_endpoint_directly.py index 7b711321..b245c1e7 100644 --- a/parea/cookbook/tracing_with_open_ai_endpoint_directly.py +++ b/parea/cookbook/tracing_with_open_ai_endpoint_directly.py @@ -55,7 +55,7 @@ def critic(argument: str) -> str: ) -@trace +@trace(eval_funcs=[random_eval]) def refiner(query: str, additional_description: str, argument: str, criticism: str) -> str: return call_llm( [ @@ -75,7 +75,7 @@ def refiner(query: str, additional_description: str, argument: str, criticism: s ) -@trace +@trace(eval_funcs=[random_eval], access_output_of_func=lambda x: x[0]) def argument_chain(query: str, additional_description: str = "") -> tuple[str, str]: trace_id = get_current_trace_id() argument = argumentor(query, additional_description) From b2a09632be6dde09727da72be313c4354da92f34 Mon Sep 17 00:00:00 2001 From: Joschka Braun Date: Tue, 17 Oct 2023 18:47:28 +0200 Subject: [PATCH 05/13] feat: do not trace loacl eval metrics --- parea/wrapper/utils.py | 21 +++++++++++++++++++++ parea/wrapper/wrapper.py | 5 ++++- 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 parea/wrapper/utils.py diff --git a/parea/wrapper/utils.py b/parea/wrapper/utils.py new file mode 100644 index 00000000..c90e2940 --- /dev/null +++ b/parea/wrapper/utils.py @@ -0,0 +1,21 @@ +from typing import Callable +import inspect +from functools import wraps + + +def skip_decorator_if_func_in_stack(func_to_check: Callable) -> Callable: + def decorator_wrapper(decorator: Callable) -> Callable: + + def new_decorator(self, func: Callable) -> Callable: # Include self + + @wraps(func) + def wrapper(*args, **kwargs): + if any(func_to_check.__name__ in frame.function for frame in inspect.stack()): + return func(*args, **kwargs) + return decorator(self, func)(*args, **kwargs) # Include self + + return wrapper + + return new_decorator + + return decorator_wrapper diff --git a/parea/wrapper/wrapper.py b/parea/wrapper/wrapper.py index 35f60fb0..bc1f35a9 100644 --- a/parea/wrapper/wrapper.py +++ b/parea/wrapper/wrapper.py @@ -8,7 +8,8 @@ from parea.cache.cache import Cache from parea.helpers import date_and_time_string_to_timestamp from parea.schemas.models import TraceLog -from parea.utils.trace_utils import to_date_and_time_string, trace_context, trace_data +from parea.utils.trace_utils import to_date_and_time_string, trace_context, trace_data, call_eval_funcs_then_log +from parea.wrapper.utils import skip_decorator_if_func_in_stack class Wrapper: @@ -88,6 +89,7 @@ def _init_trace(self) -> Tuple[str, float]: return trace_id, start_time + @skip_decorator_if_func_in_stack(call_eval_funcs_then_log) def async_decorator(self, orig_func: Callable) -> Callable: async def wrapper(*args, **kwargs): trace_id, start_time = self._init_trace() @@ -113,6 +115,7 @@ async def wrapper(*args, **kwargs): return wrapper + @skip_decorator_if_func_in_stack(call_eval_funcs_then_log) def sync_decorator(self, orig_func: Callable) -> Callable: def wrapper(*args, **kwargs): trace_id, start_time = self._init_trace() From bcd4e567bfb97987c10d594bd441c6030b03cfb7 Mon Sep 17 00:00:00 2001 From: Joschka Braun Date: Tue, 17 Oct 2023 18:47:54 +0200 Subject: [PATCH 06/13] feat: add unhelpful chat example --- .../tracing_and_evaluating_openai_endpoint.py | 119 ++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 parea/cookbook/tracing_and_evaluating_openai_endpoint.py diff --git a/parea/cookbook/tracing_and_evaluating_openai_endpoint.py b/parea/cookbook/tracing_and_evaluating_openai_endpoint.py new file mode 100644 index 00000000..9b92c206 --- /dev/null +++ b/parea/cookbook/tracing_and_evaluating_openai_endpoint.py @@ -0,0 +1,119 @@ +import os +from typing import Dict, List + +import openai +from dotenv import load_dotenv + +from parea import init +from parea.utils.trace_utils import trace + +load_dotenv() + +openai.api_key = os.getenv("OPENAI_API_KEY") + +init(api_key=os.getenv("PAREA_API_KEY")) + + +def call_llm(data: list[dict], model: str = "gpt-3.5-turbo", temperature: float = 0.0) -> str: + return openai.ChatCompletion.create(model=model, temperature=temperature, messages=data).choices[0].message["content"] + + +def friendliness(inputs: Dict, output: str, target: str = None) -> float: + response = call_llm( + [ + { + "role": "system", + "content": "You evaluate the friendliness of the following response on a scale of 0 to 10. You must only return a number." + }, + {"role": "assistant", "content": output}, + ], + model='gpt-4' + ) + try: + return float(response) / 10.0 + except TypeError: + return 0.0 + + +def usefulness(inputs: Dict, output: str, target: str = None) -> float: + user_input = inputs['messages'][-1]["content"] + response = call_llm( + [ + { + "role": "system", + "content": "You evaluate the usefulness of the response given the user input on a scale of 0 to 10. You must only return a number." + }, + {"role": "assistant", "content": f'''User input: "{user_input}"\nAssistant response: "{output}"'''} + ], + model='gpt-4' + ) + try: + return float(response) / 10.0 + except TypeError: + return 0.0 + + +@trace(eval_funcs=[friendliness, usefulness]) +def helpful_the_second_time(messages: List[Dict[str, str]]) -> str: + helpful_response = call_llm( + [ + { + "role": "system", + "content": "You are a friendly, and helpful assistant that helps people with their homework." + }, + + ] + messages, + model='gpt-4' + ) + + has_user_asked_before_raw = call_llm( + [ + { + "role": "system", + "content": "Assess if the user has asked the last question before or is asking again for more \ +information on a previous topic. If so, respond ASKED_BEFORE. Otherwise, respond NOT_ASKED_BEFORE." + } + ] + messages, + model='gpt-4' + ) + has_user_asked_before = has_user_asked_before_raw == "ASKED_BEFORE" + + if has_user_asked_before: + messages.append({"role": "assistant", "content": helpful_response}) + return helpful_response + else: + unhelfpul_response = call_llm( + [ + { + "role": "system", + "content": "Given the helpful response to the user input below, please provide a slightly unhelpful \ + response which makes the user ask again in case they didn't ask already again because of a previous unhelpful answer. \ + In case the user asked again, please provide a last response" + }, + ] + messages + [{"role": "assistant", "content": helpful_response}], + model='gpt-4' + ) + messages.append({"role": "assistant", "content": unhelfpul_response}) + return unhelfpul_response + + +@trace +def unhelpful_chat(): + print("Welcome to the chat! Type 'exit' to end the session.") + + messages = [] + while True: + user_input = input("\nYou: ") + + if user_input.lower() == 'exit': + print("Goodbye!") + break + + messages.append({"role": "user", "content": user_input}) + print("Bot:", helpful_the_second_time(messages)) + + return messages + + +if __name__ == "__main__": + unhelpful_chat() From 79665402761852cdb04fd093521a6799c4f41c18 Mon Sep 17 00:00:00 2001 From: Joschka Braun Date: Tue, 17 Oct 2023 19:17:12 +0200 Subject: [PATCH 07/13] feat: allow export to CSV --- parea/benchmark.py | 18 ++++------- .../tracing_and_evaluating_openai_endpoint.py | 31 ++++++++++++++++--- parea/helpers.py | 17 ++++++++++ 3 files changed, 49 insertions(+), 17 deletions(-) diff --git a/parea/benchmark.py b/parea/benchmark.py index a736bd5f..b90e4722 100644 --- a/parea/benchmark.py +++ b/parea/benchmark.py @@ -13,6 +13,7 @@ from tqdm import tqdm from parea.cache.redis import RedisCache +from parea.helpers import write_trace_logs_to_csv from parea.schemas.models import TraceLog @@ -73,19 +74,12 @@ def run_benchmark(args): pass print(f"Done with {len(futures)} inputs") - redis_cache = RedisCache(key_logs=redis_logs_key) - - trace_logs: list[TraceLog] = redis_cache.read_logs() + redis_cache = RedisCache( + key_logs=redis_logs_key, host=args.redis_host, port=args.redis_port, password=args.redis_password + ) # write to csv path_csv = f"trace_logs-{int(time.time())}.csv" - with open(path_csv, "w", newline="") as file: - # write header - columns = fields_dict(TraceLog).keys() - writer = csv.DictWriter(file, fieldnames=columns) - writer.writeheader() - # write rows - for trace_log in trace_logs: - writer.writerow(asdict(trace_log)) - + trace_logs: list[TraceLog] = redis_cache.read_logs() + write_trace_logs_to_csv(path_csv, trace_logs) print(f"Wrote CSV of results to: {path_csv}") diff --git a/parea/cookbook/tracing_and_evaluating_openai_endpoint.py b/parea/cookbook/tracing_and_evaluating_openai_endpoint.py index 9b92c206..9660be8f 100644 --- a/parea/cookbook/tracing_and_evaluating_openai_endpoint.py +++ b/parea/cookbook/tracing_and_evaluating_openai_endpoint.py @@ -1,17 +1,22 @@ import os +import time from typing import Dict, List import openai from dotenv import load_dotenv -from parea import init -from parea.utils.trace_utils import trace +from parea import init, RedisCache +from parea.helpers import write_trace_logs_to_csv +from parea.utils.trace_utils import trace, get_current_trace_id load_dotenv() openai.api_key = os.getenv("OPENAI_API_KEY") -init(api_key=os.getenv("PAREA_API_KEY")) + +use_cache = True +cache = RedisCache() if use_cache else None +init(api_key=os.getenv("PAREA_API_KEY"), cache=cache) def call_llm(data: list[dict], model: str = "gpt-3.5-turbo", temperature: float = 0.0) -> str: @@ -101,6 +106,8 @@ def helpful_the_second_time(messages: List[Dict[str, str]]) -> str: def unhelpful_chat(): print("Welcome to the chat! Type 'exit' to end the session.") + trace_id = get_current_trace_id() + messages = [] while True: user_input = input("\nYou: ") @@ -112,8 +119,22 @@ def unhelpful_chat(): messages.append({"role": "user", "content": user_input}) print("Bot:", helpful_the_second_time(messages)) - return messages + return messages, trace_id + + +def main(): + _ , trace_id = unhelpful_chat() + + time.sleep(0.2) + + if use_cache: + path_csv = f"trace_logs-{int(time.time())}.csv" + trace_logs = cache.read_logs() + write_trace_logs_to_csv(path_csv, trace_logs) + print(f"CSV-file of results: {path_csv}") + if os.getenv("PAREA_API_KEY"): + print(f'You can view the logs at: https://optimusprompt.ai/logs/detailed/{trace_id}') if __name__ == "__main__": - unhelpful_chat() + main() diff --git a/parea/helpers.py b/parea/helpers.py index 67eac45b..b4391d61 100644 --- a/parea/helpers.py +++ b/parea/helpers.py @@ -1,5 +1,11 @@ +import csv import time import uuid +from typing import List + +from attr import fields_dict, asdict + +from parea.schemas.models import TraceLog def gen_trace_id() -> str: @@ -13,3 +19,14 @@ def to_date_and_time_string(timestamp: float) -> str: def date_and_time_string_to_timestamp(date_and_time_string: str) -> float: return time.mktime(time.strptime(date_and_time_string, "%Y-%m-%d %H:%M:%S %Z")) + + +def write_trace_logs_to_csv(path_csv: str, trace_logs: List[TraceLog]): + with open(path_csv, "w", newline="") as file: + # write header + columns = fields_dict(TraceLog).keys() + writer = csv.DictWriter(file, fieldnames=columns) + writer.writeheader() + # write rows + for trace_log in trace_logs: + writer.writerow(asdict(trace_log)) From 2257df657176362545d5c85b374a0a111457c3aa Mon Sep 17 00:00:00 2001 From: Joschka Braun Date: Tue, 17 Oct 2023 23:20:27 +0200 Subject: [PATCH 08/13] feat: add goal success ratio --- .../tracing_and_evaluating_openai_endpoint.py | 45 ++++++++++++++++--- parea/utils/trace_utils.py | 6 ++- 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/parea/cookbook/tracing_and_evaluating_openai_endpoint.py b/parea/cookbook/tracing_and_evaluating_openai_endpoint.py index 9660be8f..1bd7a2b0 100644 --- a/parea/cookbook/tracing_and_evaluating_openai_endpoint.py +++ b/parea/cookbook/tracing_and_evaluating_openai_endpoint.py @@ -1,3 +1,4 @@ +import json import os import time from typing import Dict, List @@ -23,6 +24,41 @@ def call_llm(data: list[dict], model: str = "gpt-3.5-turbo", temperature: float return openai.ChatCompletion.create(model=model, temperature=temperature, messages=data).choices[0].message["content"] +def goal_success_ratio(inputs: Dict, output: str, target: str = None) -> float: + """Returns the average amount of turns the user had to converse with the AI to reach their goals.""" + output = json.loads(output) + # need to determine where does a new goal start + conversation_segments = [] + start_index = 0 + end_index = 3 + while end_index < len(output): + user_follows_same_goal = call_llm( + [ + { + "role": "system", + "content": "Look at the conversation and to determine if the user is still following the same goal " + "or if they are following a new goal. If they are following the same goal, respond " + "SAME_GOAL. Otherwise, respond NEW_GOAL. In any case do not answer the user request!" + } + ] + output[start_index:end_index], + model='gpt-4' + ) + + if user_follows_same_goal == "SAME_GOAL": + end_index += 2 + else: + conversation_segments.append(output[start_index:end_index - 1]) + start_index = end_index - 1 + end_index += 2 + + if start_index < len(output): + conversation_segments.append(output[start_index:]) + + # for now assume that the user reached their goal in every segment + # so we can return the average amount of turns the user had to converse with the AI to reach their goals + return sum([2 / len(segment) for segment in conversation_segments]) / len(conversation_segments) + + def friendliness(inputs: Dict, output: str, target: str = None) -> float: response = call_llm( [ @@ -102,7 +138,7 @@ def helpful_the_second_time(messages: List[Dict[str, str]]) -> str: return unhelfpul_response -@trace +@trace(eval_funcs=[goal_success_ratio], access_output_of_func=lambda x: x[0]) def unhelpful_chat(): print("Welcome to the chat! Type 'exit' to end the session.") @@ -125,15 +161,14 @@ def unhelpful_chat(): def main(): _ , trace_id = unhelpful_chat() - time.sleep(0.2) - + if os.getenv("PAREA_API_KEY"): + print(f'You can view the logs at: https://optimusprompt.ai/logs/detailed/{trace_id}') if use_cache: + time.sleep(5) # wait for local eval function to finish path_csv = f"trace_logs-{int(time.time())}.csv" trace_logs = cache.read_logs() write_trace_logs_to_csv(path_csv, trace_logs) print(f"CSV-file of results: {path_csv}") - if os.getenv("PAREA_API_KEY"): - print(f'You can view the logs at: https://optimusprompt.ai/logs/detailed/{trace_id}') if __name__ == "__main__": diff --git a/parea/utils/trace_utils.py b/parea/utils/trace_utils.py index c4108107..37035f03 100644 --- a/parea/utils/trace_utils.py +++ b/parea/utils/trace_utils.py @@ -180,10 +180,12 @@ def call_eval_funcs_then_log(trace_id: str, eval_funcs: list[Callable] = None, a try: inputs = data.inputs output = data.output - if access_output_of_func: - output = access_output_of_func(output) target = data.target if eval_funcs and data.status == "success": + if access_output_of_func: + output = json.loads(output) + output = access_output_of_func(output) + output = json.dumps(output) data.scores = [] for func in eval_funcs: try: From 924f11fef896eeec6ef648b9dd372004f89a6049 Mon Sep 17 00:00:00 2001 From: Joschka Braun Date: Tue, 17 Oct 2023 23:21:23 +0200 Subject: [PATCH 09/13] style --- parea/benchmark.py | 4 +- .../tracing_and_evaluating_openai_endpoint.py | 68 +++++++++---------- .../tracing_with_open_ai_endpoint_directly.py | 3 +- parea/helpers.py | 5 +- parea/utils/trace_utils.py | 4 +- parea/wrapper/utils.py | 3 +- parea/wrapper/wrapper.py | 2 +- 7 files changed, 41 insertions(+), 48 deletions(-) diff --git a/parea/benchmark.py b/parea/benchmark.py index b90e4722..caea82a1 100644 --- a/parea/benchmark.py +++ b/parea/benchmark.py @@ -74,9 +74,7 @@ def run_benchmark(args): pass print(f"Done with {len(futures)} inputs") - redis_cache = RedisCache( - key_logs=redis_logs_key, host=args.redis_host, port=args.redis_port, password=args.redis_password - ) + redis_cache = RedisCache(key_logs=redis_logs_key, host=args.redis_host, port=args.redis_port, password=args.redis_password) # write to csv path_csv = f"trace_logs-{int(time.time())}.csv" diff --git a/parea/cookbook/tracing_and_evaluating_openai_endpoint.py b/parea/cookbook/tracing_and_evaluating_openai_endpoint.py index 1bd7a2b0..8086b1fb 100644 --- a/parea/cookbook/tracing_and_evaluating_openai_endpoint.py +++ b/parea/cookbook/tracing_and_evaluating_openai_endpoint.py @@ -1,14 +1,15 @@ +from typing import Dict, List + import json import os import time -from typing import Dict, List import openai from dotenv import load_dotenv -from parea import init, RedisCache +from parea import RedisCache, init from parea.helpers import write_trace_logs_to_csv -from parea.utils.trace_utils import trace, get_current_trace_id +from parea.utils.trace_utils import get_current_trace_id, trace load_dotenv() @@ -37,17 +38,18 @@ def goal_success_ratio(inputs: Dict, output: str, target: str = None) -> float: { "role": "system", "content": "Look at the conversation and to determine if the user is still following the same goal " - "or if they are following a new goal. If they are following the same goal, respond " - "SAME_GOAL. Otherwise, respond NEW_GOAL. In any case do not answer the user request!" + "or if they are following a new goal. If they are following the same goal, respond " + "SAME_GOAL. Otherwise, respond NEW_GOAL. In any case do not answer the user request!", } - ] + output[start_index:end_index], - model='gpt-4' + ] + + output[start_index:end_index], + model="gpt-4", ) if user_follows_same_goal == "SAME_GOAL": end_index += 2 else: - conversation_segments.append(output[start_index:end_index - 1]) + conversation_segments.append(output[start_index : end_index - 1]) start_index = end_index - 1 end_index += 2 @@ -62,13 +64,10 @@ def goal_success_ratio(inputs: Dict, output: str, target: str = None) -> float: def friendliness(inputs: Dict, output: str, target: str = None) -> float: response = call_llm( [ - { - "role": "system", - "content": "You evaluate the friendliness of the following response on a scale of 0 to 10. You must only return a number." - }, + {"role": "system", "content": "You evaluate the friendliness of the following response on a scale of 0 to 10. You must only return a number."}, {"role": "assistant", "content": output}, ], - model='gpt-4' + model="gpt-4", ) try: return float(response) / 10.0 @@ -77,16 +76,13 @@ def friendliness(inputs: Dict, output: str, target: str = None) -> float: def usefulness(inputs: Dict, output: str, target: str = None) -> float: - user_input = inputs['messages'][-1]["content"] + user_input = inputs["messages"][-1]["content"] response = call_llm( [ - { - "role": "system", - "content": "You evaluate the usefulness of the response given the user input on a scale of 0 to 10. You must only return a number." - }, - {"role": "assistant", "content": f'''User input: "{user_input}"\nAssistant response: "{output}"'''} + {"role": "system", "content": "You evaluate the usefulness of the response given the user input on a scale of 0 to 10. You must only return a number."}, + {"role": "assistant", "content": f'''User input: "{user_input}"\nAssistant response: "{output}"'''}, ], - model='gpt-4' + model="gpt-4", ) try: return float(response) / 10.0 @@ -98,13 +94,10 @@ def usefulness(inputs: Dict, output: str, target: str = None) -> float: def helpful_the_second_time(messages: List[Dict[str, str]]) -> str: helpful_response = call_llm( [ - { - "role": "system", - "content": "You are a friendly, and helpful assistant that helps people with their homework." - }, - - ] + messages, - model='gpt-4' + {"role": "system", "content": "You are a friendly, and helpful assistant that helps people with their homework."}, + ] + + messages, + model="gpt-4", ) has_user_asked_before_raw = call_llm( @@ -112,10 +105,11 @@ def helpful_the_second_time(messages: List[Dict[str, str]]) -> str: { "role": "system", "content": "Assess if the user has asked the last question before or is asking again for more \ -information on a previous topic. If so, respond ASKED_BEFORE. Otherwise, respond NOT_ASKED_BEFORE." +information on a previous topic. If so, respond ASKED_BEFORE. Otherwise, respond NOT_ASKED_BEFORE.", } - ] + messages, - model='gpt-4' + ] + + messages, + model="gpt-4", ) has_user_asked_before = has_user_asked_before_raw == "ASKED_BEFORE" @@ -129,10 +123,12 @@ def helpful_the_second_time(messages: List[Dict[str, str]]) -> str: "role": "system", "content": "Given the helpful response to the user input below, please provide a slightly unhelpful \ response which makes the user ask again in case they didn't ask already again because of a previous unhelpful answer. \ - In case the user asked again, please provide a last response" + In case the user asked again, please provide a last response", }, - ] + messages + [{"role": "assistant", "content": helpful_response}], - model='gpt-4' + ] + + messages + + [{"role": "assistant", "content": helpful_response}], + model="gpt-4", ) messages.append({"role": "assistant", "content": unhelfpul_response}) return unhelfpul_response @@ -148,7 +144,7 @@ def unhelpful_chat(): while True: user_input = input("\nYou: ") - if user_input.lower() == 'exit': + if user_input.lower() == "exit": print("Goodbye!") break @@ -159,10 +155,10 @@ def unhelpful_chat(): def main(): - _ , trace_id = unhelpful_chat() + _, trace_id = unhelpful_chat() if os.getenv("PAREA_API_KEY"): - print(f'You can view the logs at: https://optimusprompt.ai/logs/detailed/{trace_id}') + print(f"You can view the logs at: https://optimusprompt.ai/logs/detailed/{trace_id}") if use_cache: time.sleep(5) # wait for local eval function to finish path_csv = f"trace_logs-{int(time.time())}.csv" diff --git a/parea/cookbook/tracing_with_open_ai_endpoint_directly.py b/parea/cookbook/tracing_with_open_ai_endpoint_directly.py index b245c1e7..14cd4656 100644 --- a/parea/cookbook/tracing_with_open_ai_endpoint_directly.py +++ b/parea/cookbook/tracing_with_open_ai_endpoint_directly.py @@ -1,7 +1,8 @@ +from typing import Dict, Optional + import os import random from datetime import datetime -from typing import Dict, Optional import openai from dotenv import load_dotenv diff --git a/parea/helpers.py b/parea/helpers.py index b4391d61..e80c8e45 100644 --- a/parea/helpers.py +++ b/parea/helpers.py @@ -1,9 +1,8 @@ import csv import time import uuid -from typing import List -from attr import fields_dict, asdict +from attr import asdict, fields_dict from parea.schemas.models import TraceLog @@ -21,7 +20,7 @@ def date_and_time_string_to_timestamp(date_and_time_string: str) -> float: return time.mktime(time.strptime(date_and_time_string, "%Y-%m-%d %H:%M:%S %Z")) -def write_trace_logs_to_csv(path_csv: str, trace_logs: List[TraceLog]): +def write_trace_logs_to_csv(path_csv: str, trace_logs: list[TraceLog]): with open(path_csv, "w", newline="") as file: # write header columns = fields_dict(TraceLog).keys() diff --git a/parea/utils/trace_utils.py b/parea/utils/trace_utils.py index 37035f03..c8080308 100644 --- a/parea/utils/trace_utils.py +++ b/parea/utils/trace_utils.py @@ -1,4 +1,4 @@ -from typing import Any, Optional, Union, Callable +from typing import Any, Callable, Optional, Union import contextvars import inspect @@ -13,7 +13,7 @@ from parea.helpers import gen_trace_id, to_date_and_time_string from parea.parea_logger import parea_logger -from parea.schemas.models import CompletionResponse, TraceLog, NamedEvaluationScore +from parea.schemas.models import CompletionResponse, NamedEvaluationScore, TraceLog logger = logging.getLogger() diff --git a/parea/wrapper/utils.py b/parea/wrapper/utils.py index c90e2940..50cc1ecf 100644 --- a/parea/wrapper/utils.py +++ b/parea/wrapper/utils.py @@ -1,13 +1,12 @@ from typing import Callable + import inspect from functools import wraps def skip_decorator_if_func_in_stack(func_to_check: Callable) -> Callable: def decorator_wrapper(decorator: Callable) -> Callable: - def new_decorator(self, func: Callable) -> Callable: # Include self - @wraps(func) def wrapper(*args, **kwargs): if any(func_to_check.__name__ in frame.function for frame in inspect.stack()): diff --git a/parea/wrapper/wrapper.py b/parea/wrapper/wrapper.py index bc1f35a9..81fc9e55 100644 --- a/parea/wrapper/wrapper.py +++ b/parea/wrapper/wrapper.py @@ -8,7 +8,7 @@ from parea.cache.cache import Cache from parea.helpers import date_and_time_string_to_timestamp from parea.schemas.models import TraceLog -from parea.utils.trace_utils import to_date_and_time_string, trace_context, trace_data, call_eval_funcs_then_log +from parea.utils.trace_utils import call_eval_funcs_then_log, to_date_and_time_string, trace_context, trace_data from parea.wrapper.utils import skip_decorator_if_func_in_stack From 1150e9b0c2e292f27e7d7d72b93eebb933eb2473 Mon Sep 17 00:00:00 2001 From: Joschka Braun Date: Fri, 20 Oct 2023 10:18:29 +0200 Subject: [PATCH 10/13] feat: expose eval metric names via trace decorator --- parea/cookbook/tracing_and_evaluating_openai_endpoint.py | 7 +++++++ parea/schemas/models.py | 2 +- parea/utils/trace_utils.py | 3 +++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/parea/cookbook/tracing_and_evaluating_openai_endpoint.py b/parea/cookbook/tracing_and_evaluating_openai_endpoint.py index 8086b1fb..0b387a17 100644 --- a/parea/cookbook/tracing_and_evaluating_openai_endpoint.py +++ b/parea/cookbook/tracing_and_evaluating_openai_endpoint.py @@ -165,6 +165,13 @@ def main(): trace_logs = cache.read_logs() write_trace_logs_to_csv(path_csv, trace_logs) print(f"CSV-file of results: {path_csv}") + parent_trace = None + for trace_log in trace_logs: + if trace_log.trace_id == trace_id: + parent_trace = trace_log + break + if parent_trace: + print(f'Overall score(s):\n{json.dumps(parent_trace.scores)}') if __name__ == "__main__": diff --git a/parea/schemas/models.py b/parea/schemas/models.py index 71f2b962..5cf5928f 100644 --- a/parea/schemas/models.py +++ b/parea/schemas/models.py @@ -132,7 +132,7 @@ class TraceLog: output_tokens: Optional[int] = 0 total_tokens: Optional[int] = 0 cost: Optional[float] = 0.0 - evaluation_metric_ids: Optional[list[int]] = None + evaluation_metric_names: Optional[list[str]] = None scores: Optional[list[NamedEvaluationScore]] = None feedback_score: Optional[float] = None diff --git a/parea/utils/trace_utils.py b/parea/utils/trace_utils.py index c8080308..6efab992 100644 --- a/parea/utils/trace_utils.py +++ b/parea/utils/trace_utils.py @@ -55,6 +55,7 @@ def trace( metadata: Optional[dict[str, Any]] = None, target: Optional[str] = None, end_user_identifier: Optional[str] = None, + eval_funcs_names: Optional[list[str]] = None, eval_funcs: Optional[list[Callable]] = None, access_output_of_func: Optional[Callable] = None, ): @@ -102,6 +103,7 @@ async def async_wrapper(*args, **kwargs): output = make_output(result, output_as_list) trace_data.get()[trace_id].output = output if isinstance(output, str) else json.dumps(output) trace_data.get()[trace_id].status = "success" + trace_data.get()[trace_id].evaluation_metric_names = eval_funcs_names except Exception as e: logger.exception(f"Error occurred in function {func.__name__}, {e}") trace_data.get()[trace_id].error = str(e) @@ -120,6 +122,7 @@ def wrapper(*args, **kwargs): output = make_output(result, output_as_list) trace_data.get()[trace_id].output = output if isinstance(output, str) else json.dumps(output) trace_data.get()[trace_id].status = "success" + trace_data.get()[trace_id].evaluation_metric_names = eval_funcs_names except Exception as e: logger.exception(f"Error occurred in function {func.__name__}, {e}") trace_data.get()[trace_id].error = str(e) From b98b2269647fa2fd0753a68427e4b94d8f180638 Mon Sep 17 00:00:00 2001 From: Joschka Braun Date: Fri, 20 Oct 2023 11:44:21 +0200 Subject: [PATCH 11/13] feat: send transformed output for eval metric to backend --- parea/schemas/models.py | 1 + parea/utils/trace_utils.py | 14 ++++++++------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/parea/schemas/models.py b/parea/schemas/models.py index 5cf5928f..c336b71c 100644 --- a/parea/schemas/models.py +++ b/parea/schemas/models.py @@ -132,6 +132,7 @@ class TraceLog: output_tokens: Optional[int] = 0 total_tokens: Optional[int] = 0 cost: Optional[float] = 0.0 + output_for_eval_metrics: Optional[str] = None evaluation_metric_names: Optional[list[str]] = None scores: Optional[list[NamedEvaluationScore]] = None feedback_score: Optional[float] = None diff --git a/parea/utils/trace_utils.py b/parea/utils/trace_utils.py index 6efab992..d85ca4ca 100644 --- a/parea/utils/trace_utils.py +++ b/parea/utils/trace_utils.py @@ -182,17 +182,19 @@ def call_eval_funcs_then_log(trace_id: str, eval_funcs: list[Callable] = None, a data = trace_data.get()[trace_id] try: inputs = data.inputs - output = data.output target = data.target + if access_output_of_func: + output = json.loads(data.output) + output = access_output_of_func(output) + output_for_eval_metrics = json.dumps(output) + else: + output_for_eval_metrics = data.output + data.output_for_eval_metrics = output_for_eval_metrics if eval_funcs and data.status == "success": - if access_output_of_func: - output = json.loads(output) - output = access_output_of_func(output) - output = json.dumps(output) data.scores = [] for func in eval_funcs: try: - score = func(inputs=inputs, output=output, target=target) + score = func(inputs=inputs, output=output_for_eval_metrics, target=target) data.scores.append(NamedEvaluationScore(name=func.__name__, score=score)) except Exception as e: logger.exception(f"Error occurred calling evaluation function '{func.__name__}', {e}", exc_info=e) From 089e26ba5b923a26707ba1e53bd56a8f78a2652f Mon Sep 17 00:00:00 2001 From: Joschka Braun Date: Fri, 20 Oct 2023 12:16:14 +0200 Subject: [PATCH 12/13] chore: bump version --- parea/cookbook/tracing_and_evaluating_openai_endpoint.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/parea/cookbook/tracing_and_evaluating_openai_endpoint.py b/parea/cookbook/tracing_and_evaluating_openai_endpoint.py index 0b387a17..b1989f71 100644 --- a/parea/cookbook/tracing_and_evaluating_openai_endpoint.py +++ b/parea/cookbook/tracing_and_evaluating_openai_endpoint.py @@ -171,7 +171,7 @@ def main(): parent_trace = trace_log break if parent_trace: - print(f'Overall score(s):\n{json.dumps(parent_trace.scores)}') + print(f"Overall score(s):\n{json.dumps(parent_trace.scores)}") if __name__ == "__main__": diff --git a/pyproject.toml b/pyproject.toml index 3b6f2487..d4b3be8b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "parea-ai" packages = [{ include = "parea" }] -version = "0.2.12" +version = "0.2.13" description = "Parea python sdk" readme = "README.md" authors = ["joel-parea-ai "] From 68944a41103edb0d90adc7bf4e98084c1ce57be0 Mon Sep 17 00:00:00 2001 From: Joschka Braun Date: Fri, 20 Oct 2023 12:20:34 +0200 Subject: [PATCH 13/13] chore: bump version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a8a6817c..d4b3be8b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "parea-ai" packages = [{ include = "parea" }] -version = "0.2.13a0" +version = "0.2.13" description = "Parea python sdk" readme = "README.md" authors = ["joel-parea-ai "]