
Commit 2b1aed0

lapp0 authored and rlouf committed
enable actual-byte tokens in reduced_vocabulary
1 parent 1894fa3 · commit 2b1aed0

File tree

2 files changed: +30, -3 lines


outlines/fsm/regex.py

Lines changed: 9 additions & 3 deletions
@@ -905,9 +905,15 @@ def reduced_vocabulary(
         )
 
         if token_str:
-            # invalid utf-8 sequences are replaced with � (\ufffd), but there
-            # might also be tokens specifically for �, ��, ���, etc.
-            if "\ufffd" in token_str and not re_replacement_seq.match(token):
+            if isinstance(token, bytes):
+                # Handle BPE tokenizers where the tokens are directly stored as bytes
+                # https://github.com/QwenLM/Qwen/blob/main/tokenization_note.md#regular-tokens
+                token_str = "".join(byte_symbol(b) for b in token)
+
+            elif "\ufffd" in token_str and not re_replacement_seq.match(token):
+                # invalid utf-8 sequences are replaced with � (\ufffd), but there
+                # might also be tokens specifically for �, ��, ���, etc.
+
                 if re_llama_byte_token.match(token):
                     # llama-like tokenizers have <0xXX> tokens for all
                     # bytes >= 0x80 and represent all incomplete utf-8
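For context on what the new branch produces, here is a minimal sketch of the byte-to-symbol encoding it relies on. The actual byte_symbol helper lives elsewhere in outlines/fsm/regex.py; this version merely reproduces the null-prefix-plus-uppercase-hex convention that the new test below asserts, so treat it as an assumption rather than the verbatim implementation.

# A minimal sketch, assuming byte_symbol encodes high bytes (>= 0x80) as
# "\x00" plus two uppercase hex digits, so the FSM can tell raw-byte
# symbols apart from ordinary printable characters.
def byte_symbol(byte: int) -> str:
    return f"\x00{byte:02X}" if byte >= 0x80 else chr(byte)

# Under that assumption, a Qwen-style raw-bytes token such as b"\xa1"
# maps to "\x00A1", the transition-key format checked by the test below.
assert "".join(byte_symbol(b) for b in b"\xa1") == "\x00A1"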

tests/fsm/test_regex.py

Lines changed: 21 additions & 0 deletions
@@ -714,8 +714,29 @@ def test_reduced_vocabulary_with_rare_tokens(rare_token):
 
     [1]: https://github.com/dottxt-ai/outlines/pull/763
     [2]: https://github.com/dottxt-ai/outlines/pull/948
+    [3]: https://github.com/dottxt-ai/outlines/pull/1153
     """
     tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
     tokenizer = TransformerTokenizer(tokenizer=tokenizer)
     tokenizer.vocabulary[rare_token] = max(tokenizer.vocabulary.values()) + 1
     reduced_vocabulary(tokenizer)
+
+
+def test_reduced_vocabulary_with_byte_tokens():
+    class MockTokenizer:
+        vocabulary = {
+            "string": 1,
+            b"\xa1": 2,  # Qwen-style
+            "eos": 3,
+        }
+        special_tokens = {"eos"}
+        eos_token_id = 3
+
+        def convert_token_to_string(self, token):
+            return b"\xef\xbf\xbd".decode()
+
+    reduced_vocab = reduced_vocabulary(MockTokenizer())
+
+    # See fsm.regex.get_token_transition_keys()
+    # FSM transition keys represent bytes as <null_prefix><hex_byte>
+    assert reduced_vocab[0][1][0] == "\x00A1"
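This test also shows why the isinstance(token, bytes) check has to come first: the mock's convert_token_to_string returns U+FFFD, so without the new branch a raw-bytes token would fall into the replacement-character handling instead of being encoded byte-for-byte. A hypothetical walk-through of that dispatch order, with byte_symbol inlined under the same assumption as above:

# Hypothetical trace of the dispatch introduced in regex.py above.
token = b"\xa1"                       # raw-bytes vocabulary entry (Qwen-style)
token_str = b"\xef\xbf\xbd".decode()  # "\ufffd": undecodable bytes decode to U+FFFD

if isinstance(token, bytes):
    # New path: encode the underlying bytes directly, ignoring token_str.
    token_str = "".join(f"\x00{b:02X}" if b >= 0x80 else chr(b) for b in token)
elif "\ufffd" in token_str:
    # Old path: would have treated the token as an invalid utf-8 placeholder.
    pass

assert token_str == "\x00A1"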
