
Commit

Merge pull request #155 from parea-ai/PAI-335-add-eval-metrics-to-python-sdk-trace-decorator-local-benchmark

PAI-335: add eval metrics to Python SDK trace decorator local benchmark
joschkabraun committed Oct 20, 2023
2 parents bffcada + 68944a4 commit 568186e
Showing 11 changed files with 297 additions and 32 deletions.
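For orientation, here is a minimal sketch (not part of the diff) of the usage this commit enables, based on the cookbook changes below; the function names and the constant score are illustrative:

import os
from parea import RedisCache, init
from parea.utils.trace_utils import trace

init(api_key=os.getenv("PAREA_API_KEY"), cache=RedisCache())

# local evaluation function: receives the traced function's inputs and output, returns a score in [0, 1]
def my_eval(inputs: dict, output: str, target: str = None) -> float:
    return 1.0 if output else 0.0

@trace(eval_funcs=[my_eval])
def my_llm_step(query: str) -> str:
    ...  # call an LLM and return its answer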
22 changes: 7 additions & 15 deletions parea/benchmark.py
@@ -13,6 +13,7 @@
from tqdm import tqdm

from parea.cache.redis import RedisCache
+from parea.helpers import write_trace_logs_to_csv
from parea.schemas.models import TraceLog


@@ -50,8 +51,8 @@ def async_wrapper(fn, **kwargs):

def run_benchmark(args):
parser = argparse.ArgumentParser()
parser.add_argument("--func", help="Function to test e.g., path/to/my_code.py:argument_chain", type=str)
parser.add_argument("--csv_path", help="Path to the input CSV file", type=str)
parser.add_argument("--func", help="Function to test e.g., path/to/my_code.py:argument_chain", type=str, required=True)
parser.add_argument("--csv_path", help="Path to the input CSV file", type=str, required=True)
parser.add_argument("--redis_host", help="Redis host", type=str, default=os.getenv("REDIS_HOST", "localhost"))
parser.add_argument("--redis_port", help="Redis port", type=int, default=int(os.getenv("REDIS_PORT", 6379)))
parser.add_argument("--redis_password", help="Redis password", type=str, default=None)
@@ -69,23 +70,14 @@ def run_benchmark(args):
futures = [executor.submit(async_wrapper, fn, **data_input) for data_input in data_inputs]
else:
futures = [executor.submit(fn, **data_input) for data_input in data_inputs]
-for f in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
+for _ in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
pass
print(f"Done with {len(futures)} inputs")

-redis_cache = RedisCache(key_logs=redis_logs_key)

-trace_logs: list[TraceLog] = redis_cache.read_logs()
+redis_cache = RedisCache(key_logs=redis_logs_key, host=args.redis_host, port=args.redis_port, password=args.redis_password)

-# write to csv
path_csv = f"trace_logs-{int(time.time())}.csv"
-with open(path_csv, "w", newline="") as file:
-# write header
-columns = fields_dict(TraceLog).keys()
-writer = csv.DictWriter(file, fieldnames=columns)
-writer.writeheader()
-# write rows
-for trace_log in trace_logs:
-writer.writerow(asdict(trace_log))

+trace_logs: list[TraceLog] = redis_cache.read_logs()
+write_trace_logs_to_csv(path_csv, trace_logs)
print(f"Wrote CSV of results to: {path_csv}")
2 changes: 1 addition & 1 deletion parea/client.py
@@ -31,7 +31,7 @@ def __attrs_post_init__(self):
if self.api_key:
parea_logger.set_client(self._client)
if isinstance(self.cache, RedisCache):
-parea_logger.set_redis_lru_cache(self.cache)
+parea_logger.set_redis_cache(self.cache)
_init_parea_wrapper(logger_all_possible, self.cache)

def completion(self, data: Completion) -> CompletionResponse:
178 changes: 178 additions & 0 deletions parea/cookbook/tracing_and_evaluating_openai_endpoint.py
@@ -0,0 +1,178 @@
from typing import Dict, List

import json
import os
import time

import openai
from dotenv import load_dotenv

from parea import RedisCache, init
from parea.helpers import write_trace_logs_to_csv
from parea.utils.trace_utils import get_current_trace_id, trace

load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")


use_cache = True
cache = RedisCache() if use_cache else None
init(api_key=os.getenv("PAREA_API_KEY"), cache=cache)


def call_llm(data: list[dict], model: str = "gpt-3.5-turbo", temperature: float = 0.0) -> str:
return openai.ChatCompletion.create(model=model, temperature=temperature, messages=data).choices[0].message["content"]


def goal_success_ratio(inputs: Dict, output: str, target: str = None) -> float:
"""Returns the average amount of turns the user had to converse with the AI to reach their goals."""
output = json.loads(output)
# determine where a new goal starts
conversation_segments = []
start_index = 0
end_index = 3
while end_index < len(output):
user_follows_same_goal = call_llm(
[
{
"role": "system",
"content": "Look at the conversation and to determine if the user is still following the same goal "
"or if they are following a new goal. If they are following the same goal, respond "
"SAME_GOAL. Otherwise, respond NEW_GOAL. In any case do not answer the user request!",
}
]
+ output[start_index:end_index],
model="gpt-4",
)

if user_follows_same_goal == "SAME_GOAL":
end_index += 2
else:
conversation_segments.append(output[start_index : end_index - 1])
start_index = end_index - 1
end_index += 2

if start_index < len(output):
conversation_segments.append(output[start_index:])

# for now assume that the user reached their goal in every segment
# so each segment is scored as 2 / number of messages and the scores are averaged over all segments
return sum([2 / len(segment) for segment in conversation_segments]) / len(conversation_segments)


def friendliness(inputs: Dict, output: str, target: str = None) -> float:
response = call_llm(
[
{"role": "system", "content": "You evaluate the friendliness of the following response on a scale of 0 to 10. You must only return a number."},
{"role": "assistant", "content": output},
],
model="gpt-4",
)
try:
return float(response) / 10.0
except (TypeError, ValueError):
return 0.0


def usefulness(inputs: Dict, output: str, target: str = None) -> float:
user_input = inputs["messages"][-1]["content"]
response = call_llm(
[
{"role": "system", "content": "You evaluate the usefulness of the response given the user input on a scale of 0 to 10. You must only return a number."},
{"role": "assistant", "content": f'''User input: "{user_input}"\nAssistant response: "{output}"'''},
],
model="gpt-4",
)
try:
return float(response) / 10.0
except (TypeError, ValueError):
return 0.0


@trace(eval_funcs=[friendliness, usefulness])
def helpful_the_second_time(messages: List[Dict[str, str]]) -> str:
helpful_response = call_llm(
[
{"role": "system", "content": "You are a friendly, and helpful assistant that helps people with their homework."},
]
+ messages,
model="gpt-4",
)

has_user_asked_before_raw = call_llm(
[
{
"role": "system",
"content": "Assess if the user has asked the last question before or is asking again for more \
information on a previous topic. If so, respond ASKED_BEFORE. Otherwise, respond NOT_ASKED_BEFORE.",
}
]
+ messages,
model="gpt-4",
)
has_user_asked_before = has_user_asked_before_raw == "ASKED_BEFORE"

if has_user_asked_before:
messages.append({"role": "assistant", "content": helpful_response})
return helpful_response
else:
unhelpful_response = call_llm(
[
{
"role": "system",
"content": "Given the helpful response to the user input below, please provide a slightly unhelpful \
response which makes the user ask again in case they didn't ask already again because of a previous unhelpful answer. \
In case the user asked again, please provide a last response",
},
]
+ messages
+ [{"role": "assistant", "content": helpful_response}],
model="gpt-4",
)
messages.append({"role": "assistant", "content": unhelfpul_response})
return unhelfpul_response


@trace(eval_funcs=[goal_success_ratio], access_output_of_func=lambda x: x[0])
def unhelpful_chat():
print("Welcome to the chat! Type 'exit' to end the session.")

trace_id = get_current_trace_id()

messages = []
while True:
user_input = input("\nYou: ")

if user_input.lower() == "exit":
print("Goodbye!")
break

messages.append({"role": "user", "content": user_input})
print("Bot:", helpful_the_second_time(messages))

return messages, trace_id


def main():
_, trace_id = unhelpful_chat()

if os.getenv("PAREA_API_KEY"):
print(f"You can view the logs at: https://optimusprompt.ai/logs/detailed/{trace_id}")
if use_cache:
time.sleep(5) # wait for local eval function to finish
path_csv = f"trace_logs-{int(time.time())}.csv"
trace_logs = cache.read_logs()
write_trace_logs_to_csv(path_csv, trace_logs)
print(f"CSV-file of results: {path_csv}")
parent_trace = None
for trace_log in trace_logs:
if trace_log.trace_id == trace_id:
parent_trace = trace_log
break
if parent_trace:
print(f"Overall score(s):\n{json.dumps(parent_trace.scores)}")


if __name__ == "__main__":
main()
14 changes: 11 additions & 3 deletions parea/cookbook/tracing_with_open_ai_endpoint_directly.py
@@ -1,4 +1,7 @@
+from typing import Dict, Optional

import os
+import random
from datetime import datetime

import openai
@@ -19,7 +22,12 @@ def call_llm(data: list[dict], model: str = "gpt-3.5-turbo", temperature: float
return openai.ChatCompletion.create(model=model, temperature=temperature, messages=data).choices[0].message["content"]


-@trace
+def random_eval(inputs: Dict[str, str], output, target: Optional[str] = None) -> float:
+# return random number between 0 and 1
+return random.random()


+@trace(eval_funcs=[random_eval])
def argumentor(query: str, additional_description: str = "") -> str:
return call_llm(
[
@@ -48,7 +56,7 @@ def critic(argument: str) -> str:
)


-@trace
+@trace(eval_funcs=[random_eval])
def refiner(query: str, additional_description: str, argument: str, criticism: str) -> str:
return call_llm(
[
@@ -68,7 +76,7 @@ def refiner(query: str, additional_description: str, argument: str, criticism: s
)


-@trace
+@trace(eval_funcs=[random_eval], access_output_of_func=lambda x: x[0])
def argument_chain(query: str, additional_description: str = "") -> tuple[str, str]:
trace_id = get_current_trace_id()
argument = argumentor(query, additional_description)
16 changes: 16 additions & 0 deletions parea/helpers.py
@@ -1,6 +1,11 @@
+import csv
import time
import uuid

+from attr import asdict, fields_dict

+from parea.schemas.models import TraceLog


def gen_trace_id() -> str:
"""Generate a unique trace id for each chain of requests"""
@@ -13,3 +18,14 @@ def to_date_and_time_string(timestamp: float) -> str:

def date_and_time_string_to_timestamp(date_and_time_string: str) -> float:
return time.mktime(time.strptime(date_and_time_string, "%Y-%m-%d %H:%M:%S %Z"))


+def write_trace_logs_to_csv(path_csv: str, trace_logs: list[TraceLog]):
+with open(path_csv, "w", newline="") as file:
+# write header
+columns = fields_dict(TraceLog).keys()
+writer = csv.DictWriter(file, fieldnames=columns)
+writer.writeheader()
+# write rows
+for trace_log in trace_logs:
+writer.writerow(asdict(trace_log))
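As the cookbook scripts above do, the new helper can be paired with RedisCache.read_logs() to dump locally cached trace logs to CSV; a minimal sketch, assuming a local Redis reachable with the default settings:

import time

from parea import RedisCache
from parea.helpers import write_trace_logs_to_csv

cache = RedisCache()  # default host/port, as in the cookbook example
trace_logs = cache.read_logs()
write_trace_logs_to_csv(f"trace_logs-{int(time.time())}.csv", trace_logs)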
10 changes: 5 additions & 5 deletions parea/parea_logger.py
@@ -10,13 +10,13 @@
@define
class PareaLogger:
_client: HTTPClient = field(init=False, default=None)
-_redis_lru_cache: RedisCache = field(init=False, default=None)
+_redis_cache: RedisCache = field(init=False, default=None)

def set_client(self, client: HTTPClient) -> None:
self._client = client

-def set_redis_lru_cache(self, cache: RedisCache) -> None:
-self._redis_lru_cache = cache
+def set_redis_cache(self, cache: RedisCache) -> None:
+self._redis_cache = cache

def record_log(self, data: TraceLog) -> None:
self._client.request(
@@ -33,10 +33,10 @@ async def arecord_log(self, data: TraceLog) -> None:
)

def write_log(self, data: TraceLog) -> None:
-self._redis_lru_cache.log(data)
+self._redis_cache.log(data)

def default_log(self, data: TraceLog) -> None:
-if self._redis_lru_cache:
+if self._redis_cache:
self.write_log(data)
if self._client:
self.record_log(data)
10 changes: 9 additions & 1 deletion parea/schemas/models.py
@@ -109,6 +109,12 @@ class FeedbackRequest:
target: Optional[str] = None


+@define
+class NamedEvaluationScore:
+name: str
+score: float = field(validator=[validators.ge(0), validators.le(1)])


@define
class TraceLog:
trace_id: str
@@ -119,14 +125,16 @@ class TraceLog:
error: Optional[str] = None
status: Optional[str] = None
deployment_id: Optional[str] = None
-evaluation_metric_ids: Optional[list[int]] = None
cache_hit: bool = False
configuration: LLMInputs = LLMInputs()
latency: Optional[float] = 0.0
input_tokens: Optional[int] = 0
output_tokens: Optional[int] = 0
total_tokens: Optional[int] = 0
cost: Optional[float] = 0.0
+output_for_eval_metrics: Optional[str] = None
+evaluation_metric_names: Optional[list[str]] = None
+scores: Optional[list[NamedEvaluationScore]] = None
feedback_score: Optional[float] = None

# info filled from decorator
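A small illustration (not part of the diff) of the new NamedEvaluationScore validators above; the attrs validators reject scores outside [0, 1]:

from parea.schemas.models import NamedEvaluationScore

NamedEvaluationScore(name="friendliness", score=0.8)  # accepted
NamedEvaluationScore(name="friendliness", score=1.2)  # rejected: validators.le(1) raises ValueError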