openai · LeSingh1 · May 24, 2026
diff --git a/tests/test_encoding.py b/tests/test_encoding.py
@@ -110,6 +110,29 @@ def test_encode_surrogate_pairs():
     assert enc.encode("\ud83d") == enc.encode("�")
 
 
+def test_encode_with_unstable_surrogate_pairs():
+    # Regression for openai/tiktoken#541: encode_with_unstable previously
+    # raised UnicodeEncodeError for surrogate pairs / lone surrogates, even
+    # though encode and encode_ordinary already handled them by retrying
+    # after a UTF-16 "surrogatepass" / "replace" round-trip. Mirror the
+    # same behavior so the three encode paths agree on what inputs they
+    # accept.
+    enc = tiktoken.get_encoding("cl100k_base")
+
+    # Surrogate pair (two escaped code units, not the literal emoji, which
+    # never hits the fallback) must not raise. The stable prefix must match
+    # the prefix of encode() on the repaired text, i.e. the real emoji.
+    stable_pair, _ = enc.encode_with_unstable("\ud83d\udc4d")
+    expected_pair = enc.encode("👍")
+    assert expected_pair[: len(stable_pair)] == stable_pair
+
+    # Lone surrogate must also not raise. encode() already maps it through
+    # "replace" to U+FFFD, so encode_with_unstable should agree.
+    stable_lone, _ = enc.encode_with_unstable("\ud83d")
+    expected_lone = enc.encode("\ud83d")
+    assert expected_lone[: len(stable_lone)] == stable_lone
+
+
 @pytest.mark.parametrize("make_enc", ENCODING_FACTORIES)
 def test_catastrophically_repetitive(make_enc: Callable[[], tiktoken.Encoding]):
     enc = make_enc()

diff --git a/tiktoken/core.py b/tiktoken/core.py
@@ -240,7 +240,15 @@ def encode_with_unstable(
             if match := _special_token_regex(disallowed_special).search(text):
                 raise_disallowed_special_token(match.group())
 
-        return self._core_bpe.encode_with_unstable(text, allowed_special)
+        try:
+            return self._core_bpe.encode_with_unstable(text, allowed_special)
+        except UnicodeEncodeError:
+            # See comment in encode -- BPE operates on bytes so invalid UTF-8 (e.g. surrogate
+            # pairs or lone surrogates that survived in a Python str) cannot reach Rust as-is.
+            # Repair by round-tripping through UTF-16 with "surrogatepass" + "replace" so the
+            # behavior matches encode / encode_ordinary instead of raising UnicodeEncodeError.
+            text = text.encode("utf-16", "surrogatepass").decode("utf-16", "replace")
+            return self._core_bpe.encode_with_unstable(text, allowed_special)
 
     def encode_single_token(self, text_or_bytes: str | bytes) -> int:
         """Encodes text corresponding to a single token to its token value.