Commit

Merge pull request #445 from parea-ai/fix-num-tokens
fix(encode): if error return 0
jalexanderII authored Feb 12, 2024
2 parents 35c5ee1 + 728ef93 commit d022ad7
Showing 4 changed files with 27 additions and 15 deletions.
4 changes: 2 additions & 2 deletions parea/cookbook/tracing_without_deployed_prompt.py
@@ -4,7 +4,7 @@
 from dotenv import load_dotenv
 
 from parea import Parea, get_current_trace_id, trace
-from parea.schemas import Completion, CompletionResponse, FeedbackRequest, LLMInputs, Message, ModelParams
+from parea.schemas import Completion, CompletionResponse, LLMInputs, Message, ModelParams, FeedbackRequest
 
 load_dotenv()

@@ -14,7 +14,7 @@
 @trace  # <--- If you want to log the inputs to the LLM call you can optionally add a trace decorator here
 def call_llm(
     data: list[dict],
-    model: str = "gpt-3.5-turbo",
+    model: str = "gpt-3.5-turbo-1106",
     provider: str = "openai",
     temperature: float = 0.0,
 ) -> CompletionResponse:
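
The new default pins the dated snapshot instead of the floating gpt-3.5-turbo alias. This lines up with the token accounting in parea/wrapper/utils.py below, which special-cases the dated name; a quick illustration reusing the exact expression from that diff:

    model = "gpt-3.5-turbo-1106"
    # preamble from _num_tokens_from_functions below: 3 only for the dated 1106 models
    num_tokens = 3 if model in ["gpt-3.5-turbo-1106", "gpt-4-1106-preview"] else 0
    print(num_tokens)  # 3; the bare "gpt-3.5-turbo" alias would give 0
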
3 changes: 2 additions & 1 deletion parea/evals/utils.py
@@ -12,6 +12,7 @@
 from parea.parea_logger import parea_logger
 from parea.schemas.log import Log
 from parea.schemas.models import NamedEvaluationScore, UpdateLog
+from parea.wrapper.utils import _safe_encode
 
 seg = pysbd.Segmenter(language="en", clean=False)

@@ -135,5 +136,5 @@ def get_tokens(model: str, text: str) -> Union[str, list[int]]:
         encoding = tiktoken.encoding_for_model(model)
     except KeyError:
         encoding = tiktoken.get_encoding("cl100k_base")
-    tokens = encoding.encode(text)
+    tokens = _safe_encode(encoding, text)
     return tokens
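
Worth flagging: per its definition in parea/wrapper/utils.py below, _safe_encode returns len(encoding.encode(text)), an int, so get_tokens now yields a token count rather than the token ids its Union[str, list[int]] annotation suggests. A minimal sketch of the difference, assuming tiktoken is installed:

    import tiktoken

    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode("hello world")  # old behavior: token ids, e.g. [15339, 1917]
    count = len(tokens)                      # new behavior via _safe_encode: the count (2), or 0 on error
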
33 changes: 22 additions & 11 deletions parea/wrapper/utils.py
@@ -41,6 +41,14 @@ def wrapper(*args, **kwargs):
     return decorator_wrapper
 
 
+def _safe_encode(encoding, text):
+    try:
+        return len(encoding.encode(text))
+    except Exception as e:
+        print(f"Error encoding text: {e}")
+        return 0
+
+
 def _num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613", is_azure: bool = False):
     """Return the number of tokens used by a list of messages.
     source: https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken
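
A quick behavioral sketch of the new helper (the None input is contrived, chosen only to make encode raise; assumes tiktoken is installed):

    import tiktoken

    from parea.wrapper.utils import _safe_encode

    encoding = tiktoken.get_encoding("cl100k_base")
    print(_safe_encode(encoding, "hello world"))  # 2, the count of encoded tokens
    print(_safe_encode(encoding, None))           # prints "Error encoding text: ..." and returns 0
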
@@ -86,7 +94,7 @@ def _num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613", is_azure: bool = False):
     for message in messages:
         num_tokens += tokens_per_message
         for key, value in message.items():
-            num_tokens += len(encoding.encode(value))
+            num_tokens += _safe_encode(encoding, value)
             if key == "name":
                 num_tokens += tokens_per_name
     num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
@@ -107,37 +115,41 @@ def _num_tokens_from_functions(functions, function_call, model="gpt-3.5-turbo-06
 
     num_tokens = 3 if model in ["gpt-3.5-turbo-1106", "gpt-4-1106-preview"] else 0
     for function in functions:
-        function_tokens = len(encoding.encode(function.get("name", "")))
-        function_tokens += len(encoding.encode(function.get("description", "")))
+        try:
+            function_tokens = _safe_encode(encoding, function.get("name", ""))
+            function_tokens += _safe_encode(encoding, function.get("description", ""))
+        except Exception as e:
+            print(f"Error counting tokens: {e}")
+            function_tokens = 0
 
         if "parameters" in function:
             parameters = function["parameters"]
             if "properties" in parameters:
                 for propertiesKey in parameters["properties"]:
-                    function_tokens += len(encoding.encode(propertiesKey))
+                    function_tokens += _safe_encode(encoding, propertiesKey)
                     v = parameters["properties"][propertiesKey]
                     for field in v:
                         if field == "type":
                             function_tokens += 2
-                            function_tokens += len(encoding.encode(v["type"]))
+                            function_tokens += _safe_encode(encoding, v["type"])
                         elif field == "description":
                             function_tokens += 2
-                            function_tokens += len(encoding.encode(v["description"]))
+                            function_tokens += _safe_encode(encoding, v["description"])
                         elif field == "enum":
                             function_tokens -= 3
                             for o in v["enum"]:
                                 function_tokens += 3
-                                function_tokens += len(encoding.encode(o))
+                                function_tokens += _safe_encode(encoding, o)
                         else:
                             print(f"Warning: not supported field {field}")
                 function_tokens += 11
 
         num_tokens += function_tokens
 
     num_tokens += 10
-    function_call_tokens = len(encoding.encode("auto")) - 1
+    function_call_tokens = min(_safe_encode(encoding, "auto") - 1, 0)
     if isinstance(function_call, dict):
-        function_call_tokens = len(encoding.encode(json_dumps(function_call))) - 1
+        function_call_tokens = min(_safe_encode(encoding, json_dumps(function_call)) - 1, 0)
     return num_tokens + function_call_tokens
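
One arithmetic note on the new clamp (an observation on this diff, not part of the commit): "auto" encodes to a single token, so the usual value is min(1 - 1, 0) = 0, but if encoding fails and _safe_encode returns 0, the expression becomes min(0 - 1, 0) = -1 and a token is subtracted. A max(..., 0) clamp would keep the adjustment non-negative; the commit does not say which was intended.

    print(min(1 - 1, 0))  # normal case: "auto" is one token -> 0
    print(min(0 - 1, 0))  # encode failed, _safe_encode returned 0 -> -1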


@@ -148,8 +160,7 @@ def _num_tokens_from_string(string: str, model_name: str = "gpt-3.5-turbo") -> int:
     except KeyError:
         print(f"Warning: model {model_name} not found. Using cl100k_base encoding.")
         encoding = tiktoken.get_encoding("cl100k_base")
-    num_tokens = len(encoding.encode(string))
-    return num_tokens
+    return _safe_encode(encoding, string)


def _calculate_input_tokens(
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry]
 name = "parea-ai"
 packages = [{ include = "parea" }]
-version = "0.2.66"
+version = "0.2.67"
 description = "Parea python sdk"
 readme = "README.md"
 authors = ["joel-parea-ai <[email protected]>"]