diff --git a/tests/test_encoding.py b/tests/test_encoding.py
index b77ca135..af399f0f 100644
--- a/tests/test_encoding.py
+++ b/tests/test_encoding.py
@@ -110,6 +110,29 @@ def test_encode_surrogate_pairs():
     assert enc.encode("\ud83d") == enc.encode("�")
 
 
+def test_encode_with_unstable_surrogate_pairs():
+    # Regression for openai/tiktoken#541: encode_with_unstable previously
+    # raised UnicodeEncodeError for surrogate pairs / lone surrogates, even
+    # though encode and encode_ordinary already handled them by retrying
+    # after a UTF-16 "surrogatepass" / "replace" round-trip. Mirror the
+    # same behavior so the three encode paths agree on what inputs they
+    # accept.
+    enc = tiktoken.get_encoding("cl100k_base")
+
+    # Surrogate pair must not raise. Spell it with escapes: a literal emoji is
+    # one valid code point and would never reach the UnicodeEncodeError retry.
+    # The stable prefix must match the prefix of encode() on the repaired text.
+    stable_pair, _ = enc.encode_with_unstable("\ud83d\udc4d")
+    expected_pair = enc.encode("👍")
+    assert expected_pair[: len(stable_pair)] == stable_pair
+
+    # Lone surrogate must also not raise. encode() already maps it through
+    # "replace" to U+FFFD, so encode_with_unstable should agree.
+    stable_lone, _ = enc.encode_with_unstable("\ud83d")
+    expected_lone = enc.encode("\ud83d")
+    assert expected_lone[: len(stable_lone)] == stable_lone
+
+
 @pytest.mark.parametrize("make_enc", ENCODING_FACTORIES)
 def test_catastrophically_repetitive(make_enc: Callable[[], tiktoken.Encoding]):
     enc = make_enc()
diff --git a/tiktoken/core.py b/tiktoken/core.py
index 530f8f59..461b6d81 100644
--- a/tiktoken/core.py
+++ b/tiktoken/core.py
@@ -240,7 +240,15 @@ def encode_with_unstable(
             if match := _special_token_regex(disallowed_special).search(text):
                 raise_disallowed_special_token(match.group())
 
-        return self._core_bpe.encode_with_unstable(text, allowed_special)
+        try:
+            return self._core_bpe.encode_with_unstable(text, allowed_special)
+        except UnicodeEncodeError:
+            # See comment in encode -- BPE operates on bytes so invalid UTF-8 (e.g. surrogate
+            # pairs or lone surrogates that survived in a Python str) cannot reach Rust as-is.
+            # Repair by round-tripping through UTF-16 with "surrogatepass" + "replace" so the
+            # behavior matches encode / encode_ordinary instead of raising UnicodeEncodeError.
+            text = text.encode("utf-16", "surrogatepass").decode("utf-16", "replace")
+            return self._core_bpe.encode_with_unstable(text, allowed_special)
 
     def encode_single_token(self, text_or_bytes: str | bytes) -> int:
         """Encodes text corresponding to a single token to its token value.