
Commit 2b1aed0

lapp0 authored and rlouf committed
enable actual-byte tokens in reduced_vocabulary
1 parent 1894fa3 · commit 2b1aed0

File tree

2 files changed: +30, -3 lines


outlines/fsm/regex.py

Lines changed: 9 additions & 3 deletions
@@ -905,9 +905,15 @@ def reduced_vocabulary(
         )
 
         if token_str:
-            # invalid utf-8 sequences are replaced with � (\ufffd), but there
-            # might also be tokens specifically for �, ��, ���, etc.
-            if "\ufffd" in token_str and not re_replacement_seq.match(token):
+            if isinstance(token, bytes):
+                # Handle BPE tokenizers where the tokens are directly stored as bytes
+                # https://github.com/QwenLM/Qwen/blob/main/tokenization_note.md#regular-tokens
+                token_str = "".join(byte_symbol(b) for b in token)
+
+            elif "\ufffd" in token_str and not re_replacement_seq.match(token):
+                # invalid utf-8 sequences are replaced with � (\ufffd), but there
+                # might also be tokens specifically for �, ��, ���, etc.
+
                 if re_llama_byte_token.match(token):
                     # llama-like tokenizers have <0xXX> tokens for all
                     # bytes >= 0x80 and represent all incomplete utf-8
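For context on what the new branch produces, here is a minimal sketch of the byte-to-symbol encoding it relies on. The actual byte_symbol helper lives elsewhere in outlines/fsm/regex.py; this version merely reproduces the null-prefix-plus-uppercase-hex convention that the new test below asserts, so treat it as an assumption rather than the verbatim implementation.

# A minimal sketch, assuming byte_symbol encodes high bytes (>= 0x80) as
# "\x00" plus two uppercase hex digits, so the FSM can tell raw-byte
# symbols apart from ordinary printable characters.
def byte_symbol(byte: int) -> str:
    return f"\x00{byte:02X}" if byte >= 0x80 else chr(byte)

# Under that assumption, a Qwen-style raw-bytes token such as b"\xa1"
# maps to "\x00A1", the transition-key format checked by the test below.
assert "".join(byte_symbol(b) for b in b"\xa1") == "\x00A1"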

tests/fsm/test_regex.py

Lines changed: 21 additions & 0 deletions
@@ -714,8 +714,29 @@ def test_reduced_vocabulary_with_rare_tokens(rare_token):
 
     [1]: https://github.com/dottxt-ai/outlines/pull/763
     [2]: https://github.com/dottxt-ai/outlines/pull/948
+    [3]: https://github.com/dottxt-ai/outlines/pull/1153
     """
     tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
     tokenizer = TransformerTokenizer(tokenizer=tokenizer)
     tokenizer.vocabulary[rare_token] = max(tokenizer.vocabulary.values()) + 1
     reduced_vocabulary(tokenizer)
+
+
+def test_reduced_vocabulary_with_byte_tokens():
+    class MockTokenizer:
+        vocabulary = {
+            "string": 1,
+            b"\xa1": 2,  # Qwen-style
+            "eos": 3,
+        }
+        special_tokens = {"eos"}
+        eos_token_id = 3
+
+        def convert_token_to_string(self, token):
+            return b"\xef\xbf\xbd".decode()
+
+    reduced_vocab = reduced_vocabulary(MockTokenizer())
+
+    # See fsm.regex.get_token_transition_keys()
+    # FSM transition keys represent bytes as <null_prefix><hex_byte>
+    assert reduced_vocab[0][1][0] == "\x00A1"
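This test also shows why the isinstance(token, bytes) check has to come first: the mock's convert_token_to_string returns U+FFFD, so without the new branch a raw-bytes token would fall into the replacement-character handling instead of being encoded byte-for-byte. A hypothetical walk-through of that dispatch order, with byte_symbol inlined under the same assumption as above:

# Hypothetical trace of the dispatch introduced in regex.py above.
token = b"\xa1"                       # raw-bytes vocabulary entry (Qwen-style)
token_str = b"\xef\xbf\xbd".decode()  # "\ufffd": undecodable bytes decode to U+FFFD

if isinstance(token, bytes):
    # New path: encode the underlying bytes directly, ignoring token_str.
    token_str = "".join(f"\x00{b:02X}" if b >= 0x80 else chr(b) for b in token)
elif "\ufffd" in token_str:
    # Old path: would have treated the token as an invalid utf-8 placeholder.
    pass

assert token_str == "\x00A1"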
