diff --git a/parea/evals/utils.py b/parea/evals/utils.py index 51be3903..fbcb8a32 100644 --- a/parea/evals/utils.py +++ b/parea/evals/utils.py @@ -128,12 +128,7 @@ def get_tokens(model: str, text: str) -> Union[str, list[int]]: return [] try: encoding = tiktoken.encoding_for_model(model) - tokens = encoding.encode(text) except KeyError: - regex = re.compile(r"\b(a|an|the)\b", re.UNICODE) - text = text.lower() - text = "".join(char for char in text if char not in set(string.punctuation)) - text = re.sub(regex, " ", text) - text = " ".join(text.split()) - tokens = text.split() + encoding = tiktoken.get_encoding("cl100k_base") + tokens = encoding.encode(text) return tokens diff --git a/pyproject.toml b/pyproject.toml index 96699798..4c68c1a0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "parea-ai" packages = [{ include = "parea" }] -version = "0.2.36" +version = "0.2.37" description = "Parea python sdk" readme = "README.md" authors = ["joel-parea-ai "]