Merge pull request #252 from parea-ai/PAI-446-add-eval-metric-cookbook
feat: update cookbook for chat
joschkabraun committed Dec 4, 2023
2 parents b134eb1 + 4719496 commit 615f1c4
Showing 9 changed files with 154 additions and 94 deletions.
2 changes: 1 addition & 1 deletion parea/__init__.py
@@ -12,7 +12,7 @@
from importlib import metadata as importlib_metadata

from parea.benchmark import run_benchmark
from parea.cache import RedisCache
from parea.cache import InMemoryCache, RedisCache
from parea.client import Parea, init


1 change: 1 addition & 0 deletions parea/cache/__init__.py
@@ -1 +1,2 @@
from .in_memory import InMemoryCache
from .redis import RedisCache
24 changes: 23 additions & 1 deletion parea/cache/cache.py
@@ -1,4 +1,4 @@
from typing import Optional
from typing import List, Optional

from abc import ABC

@@ -81,3 +81,25 @@ async def ainvalidate(self, key: CacheRequest):
# noqa: DAR401
"""
raise NotImplementedError

def log(self, value: TraceLog):
"""
Log a response in the cache.
Args:
value (TraceLog): The response to log.
# noqa: DAR401
"""
raise NotImplementedError

def read_logs(self) -> List[TraceLog]:
"""
Read all logs from the cache.
Returns:
List[TraceLog]: All logs in the cache.
# noqa: DAR401
"""
raise NotImplementedError
40 changes: 40 additions & 0 deletions parea/cache/in_memory.py
@@ -0,0 +1,40 @@
from typing import List, Optional

import json

from attr import asdict

from parea.cache.cache import Cache
from parea.schemas.models import CacheRequest, TraceLog


class InMemoryCache(Cache):
def __init__(self):
self.cache = {}
self.logs = []

def get(self, key: CacheRequest) -> Optional[TraceLog]:
return self.cache.get(json.dumps(asdict(key)))

async def aget(self, key: CacheRequest) -> Optional[TraceLog]:
return self.get(key)

def set(self, key: CacheRequest, value: TraceLog):
self.cache[json.dumps(asdict(key))] = value

async def aset(self, key: CacheRequest, value: TraceLog):
self.set(key, value)

def invalidate(self, key: CacheRequest):
key = json.dumps(asdict(key))
if key in self.cache:
del self.cache[key]

async def ainvalidate(self, key: CacheRequest):
self.invalidate(key)

def log(self, value: TraceLog):
self.logs.append(value)

def read_logs(self) -> List[TraceLog]:
return self.logs.copy()
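
A quick note on the new backend: `get`/`set` serialize the attrs-based key with `json.dumps(asdict(key))`, so any attrs instance with equal field values maps to the same entry, while `log`/`read_logs` simply append to and copy a plain list. Below is a minimal local sketch of that behavior; `DummyKey` and the string values stand in for the real `CacheRequest`/`TraceLog` schema objects, whose required fields aren't shown in this diff:

```python
from attrs import define

from parea.cache import InMemoryCache


@define
class DummyKey:
    # stand-in for parea.schemas.models.CacheRequest; the real required fields aren't shown here
    prompt: str


cache = InMemoryCache()
key = DummyKey(prompt="What is 2 + 2?")

cache.set(key, "4")  # a real caller would store a TraceLog here
assert cache.get(key) == "4"  # equal attrs content -> identical JSON key -> cache hit

cache.invalidate(key)
assert cache.get(key) is None

cache.log("a TraceLog would go here")
print(cache.read_logs())  # shallow copy of everything logged so far
```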
4 changes: 2 additions & 2 deletions parea/client.py
@@ -7,8 +7,8 @@
from attrs import asdict, define, field

from parea.api_client import HTTPClient
from parea.cache import InMemoryCache, RedisCache
from parea.cache.cache import Cache
from parea.cache.redis import RedisCache
from parea.helpers import gen_trace_id
from parea.parea_logger import parea_logger
from parea.schemas.models import Completion, CompletionResponse, FeedbackRequest, UseDeployedPrompt, UseDeployedPromptResponse
@@ -30,7 +30,7 @@ def __attrs_post_init__(self):
self._client.set_api_key(self.api_key)
if self.api_key:
parea_logger.set_client(self._client)
if isinstance(self.cache, RedisCache):
if isinstance(self.cache, (RedisCache, InMemoryCache)):
parea_logger.set_redis_cache(self.cache)
_init_parea_wrapper(logger_all_possible, self.cache)

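The functional change here is just the widened isinstance check: an InMemoryCache handed to the client is now registered with parea_logger the same way a RedisCache is, which is what lets the updated cookbook below capture trace logs without a Redis server. Minimal wiring, mirroring the cookbook:

```python
import os

from parea import InMemoryCache, init

cache = InMemoryCache()
init(api_key=os.getenv("PAREA_API_KEY"), cache=cache)

# ...call @trace-decorated functions here...

print(len(cache.read_logs()))  # trace logs captured in this process
```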
75 changes: 22 additions & 53 deletions parea/cookbook/tracing_and_evaluating_openai_endpoint.py
@@ -5,10 +5,14 @@
import time

import openai
from attr import asdict
from dotenv import load_dotenv

from parea import RedisCache, init
from parea import InMemoryCache, init
from parea.evals.chat import goal_success_ratio_factory
from parea.evals.utils import call_openai
from parea.helpers import write_trace_logs_to_csv
from parea.schemas.models import Log
from parea.utils.trace_utils import get_current_trace_id, trace

load_dotenv()
@@ -17,52 +17,21 @@


use_cache = True
cache = RedisCache() if use_cache else None
cache = InMemoryCache() if use_cache else None
init(api_key=os.getenv("PAREA_API_KEY"), cache=cache)


def call_llm(data: list[dict], model: str = "gpt-3.5-turbo", temperature: float = 0.0) -> str:
return openai.ChatCompletion.create(model=model, temperature=temperature, messages=data).choices[0].message["content"]


def goal_success_ratio(inputs: Dict, output: str, target: str = None) -> float:
"""Returns the average amount of turns the user had to converse with the AI to reach their goals."""
output = json.loads(output)
# need to determine where does a new goal start
conversation_segments = []
start_index = 0
end_index = 3
while end_index < len(output):
user_follows_same_goal = call_llm(
[
{
"role": "system",
"content": "Look at the conversation and to determine if the user is still following the same goal "
"or if they are following a new goal. If they are following the same goal, respond "
"SAME_GOAL. Otherwise, respond NEW_GOAL. In any case do not answer the user request!",
}
]
+ output[start_index:end_index],
model="gpt-4",
)

if user_follows_same_goal == "SAME_GOAL":
end_index += 2
else:
conversation_segments.append(output[start_index : end_index - 1])
start_index = end_index - 1
end_index += 2

if start_index < len(output):
conversation_segments.append(output[start_index:])

# for now assume that the user reached their goal in every segment
# so we can return the average amount of turns the user had to converse with the AI to reach their goals
return sum([2 / len(segment) for segment in conversation_segments]) / len(conversation_segments)


def friendliness(inputs: Dict, output: str, target: str = None) -> float:
response = call_llm(
def friendliness(log: Log) -> float:
output = log.output
response = call_openai(
[
{"role": "system", "content": "You evaluate the friendliness of the following response on a scale of 0 to 10. You must only return a number."},
{"role": "assistant", "content": output},
@@ -75,9 +40,10 @@ def friendliness(inputs: Dict, output: str, target: str = None) -> float:
return 0.0


def usefulness(inputs: Dict, output: str, target: str = None) -> float:
user_input = inputs["messages"][-1]["content"]
response = call_llm(
def usefulness(log: Log) -> float:
user_input = log.inputs["messages"][-1]["content"]
output = log.output
response = call_openai(
[
{"role": "system", "content": "You evaluate the usefulness of the response given the user input on a scale of 0 to 10. You must only return a number."},
{"role": "assistant", "content": f'''User input: "{user_input}"\nAssistant response: "{output}"'''},
@@ -92,15 +58,15 @@ def usefulness(inputs: Dict, output: str, target: str = None) -> float:

@trace(eval_funcs=[friendliness, usefulness])
def helpful_the_second_time(messages: List[Dict[str, str]]) -> str:
helpful_response = call_llm(
helpful_response = call_openai(
[
{"role": "system", "content": "You are a friendly, and helpful assistant that helps people with their homework."},
]
+ messages,
model="gpt-4",
)

has_user_asked_before_raw = call_llm(
has_user_asked_before_raw = call_openai(
[
{
"role": "system",
@@ -117,7 +83,7 @@ def helpful_the_second_time(messages: List[Dict[str, str]]) -> str:
messages.append({"role": "assistant", "content": helpful_response})
return helpful_response
else:
unhelfpul_response = call_llm(
unhelfpul_response = call_openai(
[
{
"role": "system",
Expand All @@ -134,9 +100,12 @@ def helpful_the_second_time(messages: List[Dict[str, str]]) -> str:
return unhelfpul_response


goal_success_ratio = goal_success_ratio_factory(use_output=True)


@trace(eval_funcs=[goal_success_ratio], access_output_of_func=lambda x: x[0])
def unhelpful_chat():
print("Welcome to the chat! Type 'exit' to end the session.")
print("\nWelcome to the somewhat helpful chat! Type 'exit' to end the session.")

trace_id = get_current_trace_id()

@@ -164,14 +133,14 @@ def main():
path_csv = f"trace_logs-{int(time.time())}.csv"
trace_logs = cache.read_logs()
write_trace_logs_to_csv(path_csv, trace_logs)
print(f"CSV-file of results: {path_csv}")
print(f"\nCSV-file of traces: {path_csv}")
parent_trace = None
for trace_log in trace_logs:
if trace_log.trace_id == trace_id:
parent_trace = trace_log
break
if parent_trace:
print(f"Overall score(s):\n{json.dumps(parent_trace.scores)}")
print(f"Overall score(s):\n{json.dumps(parent_trace.scores, default=asdict, indent=2)}")


if __name__ == "__main__":
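Since the eval functions above now receive a single Log record instead of (inputs, output, target), adding another metric to the cookbook is a matter of writing one more Log-based function and listing it in eval_funcs. A sketch under that assumption; `concise` and `haiku_about` are illustrative names, not part of the SDK:

```python
from parea.evals.utils import call_openai
from parea.schemas.models import Log
from parea.utils.trace_utils import trace


def concise(log: Log) -> float:
    # toy heuristic: reward short answers without another LLM call
    return 1.0 if len(log.output or "") < 400 else 0.0


@trace(eval_funcs=[concise])
def haiku_about(topic: str) -> str:
    return call_openai([{"role": "user", "content": f"Write a haiku about {topic}."}], model="gpt-3.5-turbo")
```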
90 changes: 57 additions & 33 deletions parea/evals/chat.py
@@ -1,40 +1,64 @@
from typing import Callable, Optional

import json

from parea.evals.utils import call_openai
from parea.schemas.models import Log


def goal_success_ratio(log: Log) -> float:
"""Returns the average amount of turns the user had to converse with the AI to reach their goals."""
messages = [m.to_dict() for m in log.configuration.messages]
messages.append({"role": "assistant", "content": log.output})

# need to determine where does a new goal start
conversation_segments = []
start_index = 0
end_index = 3
while end_index < len(messages):
user_follows_same_goal = call_openai(
[
{
"role": "system",
"content": "Look at the conversation and to determine if the user is still following the same goal "
"or if they are following a new goal. If they are following the same goal, respond "
"SAME_GOAL. Otherwise, respond NEW_GOAL. In any case do not answer the user request!",
}
]
+ messages[start_index:end_index],
model="gpt-4",
)

if user_follows_same_goal == "SAME_GOAL":
end_index += 2
def goal_success_ratio_factory(use_output: Optional[bool] = False, message_field: Optional[str] = None) -> Callable[[Log], float]:
"""Factory function that returns a function that calculates the goal success ratio of a log.
Args:
use_output (Optional[bool], optional): Whether to use the output of the log to access the messages. Defaults to False.
message_field (Optional[str], optional): The name of the field in the log that contains the messages.
Defaults to None. If None, the messages are taken from the configuration attribute.
"""
if use_output and message_field:
raise ValueError("Only one of use_output and message_field can be set.")

def goal_success_ratio(log: Log) -> float:
"""Returns the average amount of turns the user had to converse with the AI to reach their goals."""
if use_output:
output_list_dicts = json.loads(log.output)
messages = [m for m in output_list_dicts]
elif message_field:
messages = [m for m in log.inputs[message_field]]
else:
conversation_segments.append(messages[start_index : end_index - 1])
start_index = end_index - 1
end_index += 2
messages = [m.to_dict() for m in log.configuration.messages]
if log.output:
messages.append({"role": "assistant", "content": log.output})

# need to determine where does a new goal start
conversation_segments = []
start_index = 0
end_index = 3
while end_index < len(messages):
user_follows_same_goal = call_openai(
[
{
"role": "system",
"content": "Look at the conversation and to determine if the user is still following the same goal "
"or if they are following a new goal. If they are following the same goal, respond "
"SAME_GOAL. Otherwise, respond NEW_GOAL. In any case do not answer the user request!",
}
]
+ messages[start_index:end_index],
model="gpt-4",
)

if user_follows_same_goal == "SAME_GOAL":
end_index += 2
else:
conversation_segments.append(messages[start_index : end_index - 1])
start_index = end_index - 1
end_index += 2

if start_index < len(messages):
conversation_segments.append(messages[start_index:])

if start_index < len(messages):
conversation_segments.append(messages[start_index:])
# for now assume that the user reached their goal in every segment
# return the average amount of turns the user had to converse with the AI to reach their goals
return sum([2 / len(segment) for segment in conversation_segments]) / len(conversation_segments)

# for now assume that the user reached their goal in every segment
# return the average amount of turns the user had to converse with the AI to reach their goals
return sum([2 / len(segment) for segment in conversation_segments]) / len(conversation_segments)
return goal_success_ratio
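
For reference, the three ways the factory above can be configured, matching its docstring; the variable names are only for illustration:

```python
from parea.evals.chat import goal_success_ratio_factory

# messages serialized as JSON in the trace output (what the updated cookbook uses)
from_output = goal_success_ratio_factory(use_output=True)

# messages passed to the traced function under an input field, here assumed to be called "messages"
from_inputs = goal_success_ratio_factory(message_field="messages")

# default: messages taken from log.configuration.messages
from_config = goal_success_ratio_factory()

# setting both options raises ValueError("Only one of use_output and message_field can be set.")
```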
10 changes: 7 additions & 3 deletions parea/utils/trace_utils.py
@@ -179,23 +179,27 @@ def logger_all_possible(trace_id: str):
def call_eval_funcs_then_log(trace_id: str, eval_funcs: list[Callable] = None, access_output_of_func: Callable = None):
data = trace_data.get()[trace_id]
try:
inputs = data.inputs
target = data.target
if eval_funcs and data.status == "success":
if access_output_of_func:
output = json.loads(data.output)
output = access_output_of_func(output)
output_for_eval_metrics = json_dumps(output)
else:
output_for_eval_metrics = data.output

data.output_for_eval_metrics = output_for_eval_metrics
output_old = data.output
data.output = data.output_for_eval_metrics
data.scores = []

for func in eval_funcs:
try:
score = func(inputs=inputs, output=output_for_eval_metrics, target=target)
score = func(data)
data.scores.append(NamedEvaluationScore(name=func.__name__, score=score))
except Exception as e:
logger.exception(f"Error occurred calling evaluation function '{func.__name__}', {e}", exc_info=e)

data.output = output_old
except Exception as e:
logger.exception(f"Error occurred in when trying to evaluate output, {e}", exc_info=e)
parea_logger.default_log(data=data)
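The call-site change above is what custom evals need to track: they used to be invoked as func(inputs=..., output=..., target=...) and are now handed the whole trace record. A sketch of the new shape, where `contains_target` is a hypothetical metric and `log.target` is assumed to hold the reference answer when one was provided:

```python
from parea.schemas.models import Log


def contains_target(log: Log) -> float:
    # the full record is available: log.inputs, log.output, log.target, log.configuration, ...
    if not log.target:
        return 0.0
    return float(log.target in (log.output or ""))
```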
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry]
name = "parea-ai"
packages = [{ include = "parea" }]
version = "0.2.19"
version = "0.2.20"
description = "Parea python sdk"
readme = "README.md"
authors = ["joel-parea-ai <[email protected]>"]
