From 9f7ea2abee20a16ac23b20b4326fd32652c45b77 Mon Sep 17 00:00:00 2001 From: Joschka Braun Date: Mon, 4 Dec 2023 10:37:34 -0500 Subject: [PATCH 1/2] feat: update cookbook for chat --- parea/__init__.py | 2 +- parea/cache/__init__.py | 1 + parea/cache/cache.py | 24 ++++- parea/cache/in_memory.py | 39 ++++++++ parea/client.py | 4 +- .../tracing_and_evaluating_openai_endpoint.py | 75 +++++---------- parea/evals/chat.py | 92 ++++++++++++------- parea/utils/trace_utils.py | 10 +- 8 files changed, 154 insertions(+), 93 deletions(-) create mode 100644 parea/cache/in_memory.py diff --git a/parea/__init__.py b/parea/__init__.py index 8ce05d05..9205e1b5 100644 --- a/parea/__init__.py +++ b/parea/__init__.py @@ -12,7 +12,7 @@ from importlib import metadata as importlib_metadata from parea.benchmark import run_benchmark -from parea.cache import RedisCache +from parea.cache import RedisCache, InMemoryCache from parea.client import Parea, init diff --git a/parea/cache/__init__.py b/parea/cache/__init__.py index 86b8c5ec..f309ee48 100644 --- a/parea/cache/__init__.py +++ b/parea/cache/__init__.py @@ -1 +1,2 @@ from .redis import RedisCache +from .in_memory import InMemoryCache diff --git a/parea/cache/cache.py b/parea/cache/cache.py index 35dff655..a5779343 100644 --- a/parea/cache/cache.py +++ b/parea/cache/cache.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, List from abc import ABC @@ -81,3 +81,25 @@ async def ainvalidate(self, key: CacheRequest): # noqa: DAR401 """ raise NotImplementedError + + def log(self, value: TraceLog): + """ + Log a response in the cache. + + Args: + value (TraceLog): The response to log. + + # noqa: DAR401 + """ + raise NotImplementedError + + def read_logs(self) -> List[TraceLog]: + """ + Read all logs from the cache. + + Returns: + List[TraceLog]: All logs in the cache. 
+ + # noqa: DAR401 + """ + raise NotImplementedError diff --git a/parea/cache/in_memory.py b/parea/cache/in_memory.py new file mode 100644 index 00000000..c45cc1ba --- /dev/null +++ b/parea/cache/in_memory.py @@ -0,0 +1,39 @@ +import json +from typing import Optional, List + +from attr import asdict + +from parea.cache.cache import Cache +from parea.schemas.models import CacheRequest, TraceLog + + +class InMemoryCache(Cache): + def __init__(self): + self.cache = {} + self.logs = [] + + def get(self, key: CacheRequest) -> Optional[TraceLog]: + return self.cache.get(json.dumps(asdict(key))) + + async def aget(self, key: CacheRequest) -> Optional[TraceLog]: + return self.get(key) + + def set(self, key: CacheRequest, value: TraceLog): + self.cache[json.dumps(asdict(key))] = value + + async def aset(self, key: CacheRequest, value: TraceLog): + self.set(key, value) + + def invalidate(self, key: CacheRequest): + key = json.dumps(asdict(key)) + if key in self.cache: + del self.cache[key] + + async def ainvalidate(self, key: CacheRequest): + self.invalidate(key) + + def log(self, value: TraceLog): + self.logs.append(value) + + def read_logs(self) -> List[TraceLog]: + return self.logs.copy() diff --git a/parea/client.py b/parea/client.py index f0043dfb..26d2260f 100644 --- a/parea/client.py +++ b/parea/client.py @@ -8,7 +8,7 @@ from parea.api_client import HTTPClient from parea.cache.cache import Cache -from parea.cache.redis import RedisCache +from parea.cache import RedisCache, InMemoryCache from parea.helpers import gen_trace_id from parea.parea_logger import parea_logger from parea.schemas.models import Completion, CompletionResponse, FeedbackRequest, UseDeployedPrompt, UseDeployedPromptResponse @@ -30,7 +30,7 @@ def __attrs_post_init__(self): self._client.set_api_key(self.api_key) if self.api_key: parea_logger.set_client(self._client) - if isinstance(self.cache, RedisCache): + if isinstance(self.cache, (RedisCache, InMemoryCache)): parea_logger.set_redis_cache(self.cache) _init_parea_wrapper(logger_all_possible, self.cache) diff --git a/parea/cookbook/tracing_and_evaluating_openai_endpoint.py b/parea/cookbook/tracing_and_evaluating_openai_endpoint.py index 5c504996..7531ac03 100644 --- a/parea/cookbook/tracing_and_evaluating_openai_endpoint.py +++ b/parea/cookbook/tracing_and_evaluating_openai_endpoint.py @@ -5,10 +5,14 @@ import time import openai +from attr import asdict from dotenv import load_dotenv -from parea import RedisCache, init +from parea import init, InMemoryCache +from parea.evals.chat import goal_success_ratio_factory +from parea.evals.utils import call_openai from parea.helpers import write_trace_logs_to_csv +from parea.schemas.models import Log from parea.utils.trace_utils import get_current_trace_id, trace load_dotenv() @@ -17,52 +21,13 @@ use_cache = True -cache = RedisCache() if use_cache else None +cache = InMemoryCache() if use_cache else None init(api_key=os.getenv("PAREA_API_KEY"), cache=cache) -def call_llm(data: list[dict], model: str = "gpt-3.5-turbo", temperature: float = 0.0) -> str: - return openai.ChatCompletion.create(model=model, temperature=temperature, messages=data).choices[0].message["content"] - - -def goal_success_ratio(inputs: Dict, output: str, target: str = None) -> float: - """Returns the average amount of turns the user had to converse with the AI to reach their goals.""" - output = json.loads(output) - # need to determine where does a new goal start - conversation_segments = [] - start_index = 0 - end_index = 3 - while end_index < len(output): - 
user_follows_same_goal = call_llm( - [ - { - "role": "system", - "content": "Look at the conversation and to determine if the user is still following the same goal " - "or if they are following a new goal. If they are following the same goal, respond " - "SAME_GOAL. Otherwise, respond NEW_GOAL. In any case do not answer the user request!", - } - ] - + output[start_index:end_index], - model="gpt-4", - ) - - if user_follows_same_goal == "SAME_GOAL": - end_index += 2 - else: - conversation_segments.append(output[start_index : end_index - 1]) - start_index = end_index - 1 - end_index += 2 - - if start_index < len(output): - conversation_segments.append(output[start_index:]) - - # for now assume that the user reached their goal in every segment - # so we can return the average amount of turns the user had to converse with the AI to reach their goals - return sum([2 / len(segment) for segment in conversation_segments]) / len(conversation_segments) - - -def friendliness(inputs: Dict, output: str, target: str = None) -> float: - response = call_llm( +def friendliness(log: Log) -> float: + output = log.output + response = call_openai( [ {"role": "system", "content": "You evaluate the friendliness of the following response on a scale of 0 to 10. You must only return a number."}, {"role": "assistant", "content": output}, @@ -75,9 +40,10 @@ def friendliness(inputs: Dict, output: str, target: str = None) -> float: return 0.0 -def usefulness(inputs: Dict, output: str, target: str = None) -> float: - user_input = inputs["messages"][-1]["content"] - response = call_llm( +def usefulness(log: Log) -> float: + user_input = log.inputs['messages'][-1]['content'] + output = log.output + response = call_openai( [ {"role": "system", "content": "You evaluate the usefulness of the response given the user input on a scale of 0 to 10. You must only return a number."}, {"role": "assistant", "content": f'''User input: "{user_input}"\nAssistant response: "{output}"'''}, @@ -92,7 +58,7 @@ def usefulness(inputs: Dict, output: str, target: str = None) -> float: @trace(eval_funcs=[friendliness, usefulness]) def helpful_the_second_time(messages: List[Dict[str, str]]) -> str: - helpful_response = call_llm( + helpful_response = call_openai( [ {"role": "system", "content": "You are a friendly, and helpful assistant that helps people with their homework."}, ] @@ -100,7 +66,7 @@ def helpful_the_second_time(messages: List[Dict[str, str]]) -> str: model="gpt-4", ) - has_user_asked_before_raw = call_llm( + has_user_asked_before_raw = call_openai( [ { "role": "system", @@ -117,7 +83,7 @@ def helpful_the_second_time(messages: List[Dict[str, str]]) -> str: messages.append({"role": "assistant", "content": helpful_response}) return helpful_response else: - unhelfpul_response = call_llm( + unhelfpul_response = call_openai( [ { "role": "system", @@ -134,9 +100,12 @@ def helpful_the_second_time(messages: List[Dict[str, str]]) -> str: return unhelfpul_response +goal_success_ratio = goal_success_ratio_factory(use_output=True) + + @trace(eval_funcs=[goal_success_ratio], access_output_of_func=lambda x: x[0]) def unhelpful_chat(): - print("Welcome to the chat! Type 'exit' to end the session.") + print("\nWelcome to the somewhat helpful chat! 
Type 'exit' to end the session.")

     trace_id = get_current_trace_id()

@@ -164,14 +133,14 @@ def main():
     path_csv = f"trace_logs-{int(time.time())}.csv"
     trace_logs = cache.read_logs()
     write_trace_logs_to_csv(path_csv, trace_logs)
-    print(f"CSV-file of results: {path_csv}")
+    print(f"\nCSV-file of traces: {path_csv}")
     parent_trace = None
     for trace_log in trace_logs:
         if trace_log.trace_id == trace_id:
             parent_trace = trace_log
             break
     if parent_trace:
-        print(f"Overall score(s):\n{json.dumps(parent_trace.scores)}")
+        print(f"Overall score(s):\n{json.dumps(parent_trace.scores, default=asdict, indent=2)}")


 if __name__ == "__main__":
diff --git a/parea/evals/chat.py b/parea/evals/chat.py
index ae9fe3cc..ab9889e7 100644
--- a/parea/evals/chat.py
+++ b/parea/evals/chat.py
@@ -1,40 +1,66 @@
+import json
+from typing import Optional, Callable
+
 from parea.evals.utils import call_openai
 from parea.schemas.models import Log


-def goal_success_ratio(log: Log) -> float:
-    """Returns the average amount of turns the user had to converse with the AI to reach their goals."""
-    messages = [m.to_dict() for m in log.configuration.messages]
-    messages.append({"role": "assistant", "content": log.output})
-
-    # need to determine where does a new goal start
-    conversation_segments = []
-    start_index = 0
-    end_index = 3
-    while end_index < len(messages):
-        user_follows_same_goal = call_openai(
-            [
-                {
-                    "role": "system",
-                    "content": "Look at the conversation and to determine if the user is still following the same goal "
-                    "or if they are following a new goal. If they are following the same goal, respond "
-                    "SAME_GOAL. Otherwise, respond NEW_GOAL. In any case do not answer the user request!",
-                }
-            ]
-            + messages[start_index:end_index],
-            model="gpt-4",
-        )
-
-        if user_follows_same_goal == "SAME_GOAL":
-            end_index += 2
+def goal_success_ratio_factory(
+    use_output: Optional[bool] = False,
+    message_field: Optional[str] = None
+) -> Callable[[Log], float]:
+    """Factory function that returns a function that calculates the goal success ratio of a log.
+
+    Args:
+        use_output (Optional[bool], optional): Whether to use the output of the log to access the messages. Defaults to False.
+        message_field (Optional[str], optional): The name of the field in the log that contains the messages.
+            Defaults to None. If None, the messages are taken from the configuration attribute.
+    """
+    if use_output and message_field:
+        raise ValueError("Only one of use_output and message_field can be set.")
+
+    def goal_success_ratio(log: Log) -> float:
+        """Returns the average number of turns the user had to converse with the AI to reach their goals."""
+        if use_output:
+            output_list_dicts = json.loads(log.output)
+            messages = [m for m in output_list_dicts]
+        elif message_field:
+            messages = [m for m in log.inputs[message_field]]
         else:
-            conversation_segments.append(messages[start_index : end_index - 1])
-            start_index = end_index - 1
-            end_index += 2
+            messages = [m.to_dict() for m in log.configuration.messages]
+            if log.output:
+                messages.append({"role": "assistant", "content": log.output})
+
+        # need to determine where a new goal starts
+        conversation_segments = []
+        start_index = 0
+        end_index = 3
+        while end_index < len(messages):
+            user_follows_same_goal = call_openai(
+                [
+                    {
+                        "role": "system",
+                        "content": "Look at the conversation and determine if the user is still following the same goal "
+                        "or if they are following a new goal. If they are following the same goal, respond "
+                        "SAME_GOAL. Otherwise, respond NEW_GOAL. In any case do not answer the user request!",
+                    }
+                ]
+                + messages[start_index:end_index],
+                model="gpt-4",
+            )
+
+            if user_follows_same_goal == "SAME_GOAL":
+                end_index += 2
+            else:
+                conversation_segments.append(messages[start_index : end_index - 1])
+                start_index = end_index - 1
+                end_index += 2

-    if start_index < len(messages):
-        conversation_segments.append(messages[start_index:])
+        if start_index < len(messages):
+            conversation_segments.append(messages[start_index:])

-    # for now assume that the user reached their goal in every segment
-    # return the average amount of turns the user had to converse with the AI to reach their goals
-    return sum([2 / len(segment) for segment in conversation_segments]) / len(conversation_segments)
+        # for now assume that the user reached their goal in every segment
+        # return the average number of turns the user had to converse with the AI to reach their goals
+        return sum([2 / len(segment) for segment in conversation_segments]) / len(conversation_segments)
+
+    return goal_success_ratio
diff --git a/parea/utils/trace_utils.py b/parea/utils/trace_utils.py
index 05a477bb..3f44da1e 100644
--- a/parea/utils/trace_utils.py
+++ b/parea/utils/trace_utils.py
@@ -179,8 +179,6 @@ def logger_all_possible(trace_id: str):
 def call_eval_funcs_then_log(trace_id: str, eval_funcs: list[Callable] = None, access_output_of_func: Callable = None):
     data = trace_data.get()[trace_id]
     try:
-        inputs = data.inputs
-        target = data.target
         if eval_funcs and data.status == "success":
             if access_output_of_func:
                 output = json.loads(data.output)
@@ -188,14 +186,20 @@ def call_eval_funcs_then_log(trace_id: str, eval_funcs: list[Callable] = None, a
                 output_for_eval_metrics = json_dumps(output)
             else:
                 output_for_eval_metrics = data.output
+            data.output_for_eval_metrics = output_for_eval_metrics
+            output_old = data.output
+            data.output = data.output_for_eval_metrics
             data.scores = []
+
             for func in eval_funcs:
                 try:
-                    score = func(inputs=inputs, output=output_for_eval_metrics, target=target)
+                    score = func(data)
                     data.scores.append(NamedEvaluationScore(name=func.__name__, score=score))
                 except Exception as e:
                     logger.exception(f"Error occurred calling evaluation function '{func.__name__}', {e}", exc_info=e)
+
+            data.output = output_old
     except Exception as e:
         logger.exception(f"Error occurred in when trying to evaluate output, {e}", exc_info=e)
     parea_logger.default_log(data=data)

From 47194967982b3f7bebb11813c063d05963cabbe2 Mon Sep 17 00:00:00 2001
From: Joschka Braun
Date: Mon, 4 Dec 2023 10:39:32 -0500
Subject: [PATCH 2/2] chore: bump version, style

---
 parea/__init__.py                                        | 2 +-
 parea/cache/__init__.py                                  | 2 +-
 parea/cache/cache.py                                     | 2 +-
 parea/cache/in_memory.py                                 | 3 ++-
 parea/client.py                                          | 2 +-
 parea/cookbook/tracing_and_evaluating_openai_endpoint.py | 4 ++--
 parea/evals/chat.py                                      | 8 +++-----
 pyproject.toml                                           | 2 +-
 8 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/parea/__init__.py b/parea/__init__.py
index 9205e1b5..1fc6c484 100644
--- a/parea/__init__.py
+++ b/parea/__init__.py
@@ -12,7 +12,7 @@ from importlib import metadata as importlib_metadata

 from parea.benchmark import run_benchmark
-from parea.cache import RedisCache, InMemoryCache
+from parea.cache import InMemoryCache, RedisCache
 from parea.client import Parea, init

diff --git a/parea/cache/__init__.py b/parea/cache/__init__.py
index f309ee48..5373c7dd 100644
--- a/parea/cache/__init__.py
+++ b/parea/cache/__init__.py
@@ -1,2 +1,2 @@
-from .redis import RedisCache
 from .in_memory import InMemoryCache
+from .redis import 
RedisCache diff --git a/parea/cache/cache.py b/parea/cache/cache.py index a5779343..224fc8cb 100644 --- a/parea/cache/cache.py +++ b/parea/cache/cache.py @@ -1,4 +1,4 @@ -from typing import Optional, List +from typing import List, Optional from abc import ABC diff --git a/parea/cache/in_memory.py b/parea/cache/in_memory.py index c45cc1ba..46395e6d 100644 --- a/parea/cache/in_memory.py +++ b/parea/cache/in_memory.py @@ -1,5 +1,6 @@ +from typing import List, Optional + import json -from typing import Optional, List from attr import asdict diff --git a/parea/client.py b/parea/client.py index 26d2260f..dd6d0978 100644 --- a/parea/client.py +++ b/parea/client.py @@ -7,8 +7,8 @@ from attrs import asdict, define, field from parea.api_client import HTTPClient +from parea.cache import InMemoryCache, RedisCache from parea.cache.cache import Cache -from parea.cache import RedisCache, InMemoryCache from parea.helpers import gen_trace_id from parea.parea_logger import parea_logger from parea.schemas.models import Completion, CompletionResponse, FeedbackRequest, UseDeployedPrompt, UseDeployedPromptResponse diff --git a/parea/cookbook/tracing_and_evaluating_openai_endpoint.py b/parea/cookbook/tracing_and_evaluating_openai_endpoint.py index 7531ac03..f9cd3377 100644 --- a/parea/cookbook/tracing_and_evaluating_openai_endpoint.py +++ b/parea/cookbook/tracing_and_evaluating_openai_endpoint.py @@ -8,7 +8,7 @@ from attr import asdict from dotenv import load_dotenv -from parea import init, InMemoryCache +from parea import InMemoryCache, init from parea.evals.chat import goal_success_ratio_factory from parea.evals.utils import call_openai from parea.helpers import write_trace_logs_to_csv @@ -41,7 +41,7 @@ def friendliness(log: Log) -> float: def usefulness(log: Log) -> float: - user_input = log.inputs['messages'][-1]['content'] + user_input = log.inputs["messages"][-1]["content"] output = log.output response = call_openai( [ diff --git a/parea/evals/chat.py b/parea/evals/chat.py index ab9889e7..21be2e86 100644 --- a/parea/evals/chat.py +++ b/parea/evals/chat.py @@ -1,14 +1,12 @@ +from typing import Callable, Optional + import json -from typing import Optional, Callable from parea.evals.utils import call_openai from parea.schemas.models import Log -def goal_success_ratio_factory( - use_output: Optional[bool] = False, - message_field: Optional[str] = None -) -> Callable[[Log], float]: +def goal_success_ratio_factory(use_output: Optional[bool] = False, message_field: Optional[str] = None) -> Callable[[Log], float]: """Factory function that returns a function that calculates the goal success ratio of a log. Args: diff --git a/pyproject.toml b/pyproject.toml index 3ef97532..1f9f1f3d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "parea-ai" packages = [{ include = "parea" }] -version = "0.2.19" +version = "0.2.20" description = "Parea python sdk" readme = "README.md" authors = ["joel-parea-ai "]
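
A note on the arithmetic behind goal_success_ratio, since it is easy to misread: each conversation segment of length n contributes 2 / n to the score, so a goal resolved in a single user/assistant exchange (2 messages) scores 1.0, and the final result is the mean over all segments. Below is a minimal standalone sketch of just that scoring step; the segment contents are hypothetical and the LLM-based segmentation from the patch is stubbed out:

# Standalone sketch of the scoring rule used by goal_success_ratio above.
# Hypothetical, pre-segmented conversation: the first goal took one
# user/assistant exchange (2 messages), the second took three (6 messages).
conversation_segments = [
    [
        {"role": "user", "content": "What is 2 + 2?"},
        {"role": "assistant", "content": "4"},
    ],
    [
        {"role": "user", "content": "Help me outline my essay."},
        {"role": "assistant", "content": "What is the topic?"},
        {"role": "user", "content": "The French Revolution."},
        {"role": "assistant", "content": "Here is a three-part outline ..."},
        {"role": "user", "content": "Expand the second part, please."},
        {"role": "assistant", "content": "Sure: ..."},
    ],
]

# Average of 2 / len(segment): 1.0 for the first segment, 1/3 for the second.
score = sum(2 / len(segment) for segment in conversation_segments) / len(conversation_segments)
print(score)  # (2/2 + 2/6) / 2 = 0.666...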