2 files changed: +30 -3 lines changed

@@ -905,9 +905,15 @@ def reduced_vocabulary(
         )

         if token_str:
-            # invalid utf-8 sequences are replaced with � (\ufffd), but there
-            # might also be tokens specifically for �, ��, ���, etc.
-            if "\ufffd" in token_str and not re_replacement_seq.match(token):
+            if isinstance(token, bytes):
+                # Handle BPE tokenizers where the tokens are directly stored as bytes
+                # https://github.com/QwenLM/Qwen/blob/main/tokenization_note.md#regular-tokens
+                token_str = "".join(byte_symbol(b) for b in token)
+
+            elif "\ufffd" in token_str and not re_replacement_seq.match(token):
+                # invalid utf-8 sequences are replaced with � (\ufffd), but there
+                # might also be tokens specifically for �, ��, ���, etc.
+
                 if re_llama_byte_token.match(token):
                     # llama-like tokenizers have <0xXX> tokens for all
                     # bytes >= 0x80 and represent all incomplete utf-8
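
The new `isinstance(token, bytes)` branch relies on a `byte_symbol` helper that is already defined elsewhere in this module but does not appear in the hunk. A minimal sketch of what it plausibly does, inferred from the test assertion below (byte `0xa1` must surface as `"\x00A1"`):

def byte_symbol(byte: int) -> str:
    # Sketch only; the real helper lives alongside reduced_vocabulary().
    # Bytes >= 0x80 can never stand alone in valid UTF-8, so they are
    # represented as a two-character FSM symbol: "\x00" plus the byte's
    # uppercase hex (0xa1 -> "\x00A1"); ASCII bytes map to themselves.
    return f"\x00{byte:02X}" if byte >= 0x80 else chr(byte)

The `"\x00"` prefix presumably keeps high bytes from colliding with ordinary one-character vocabulary symbols in the FSM alphabet.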
@@ -714,8 +714,29 @@ def test_reduced_vocabulary_with_rare_tokens(rare_token):

     [1]: https://github.com/dottxt-ai/outlines/pull/763
     [2]: https://github.com/dottxt-ai/outlines/pull/948
+    [3]: https://github.com/dottxt-ai/outlines/pull/1153
     """
     tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
     tokenizer = TransformerTokenizer(tokenizer=tokenizer)
     tokenizer.vocabulary[rare_token] = max(tokenizer.vocabulary.values()) + 1
     reduced_vocabulary(tokenizer)
+
+
+def test_reduced_vocabulary_with_byte_tokens():
+    class MockTokenizer:
+        vocabulary = {
+            "string": 1,
+            b"\xa1": 2,  # Qwen-style
+            "eos": 3,
+        }
+        special_tokens = {"eos"}
+        eos_token_id = 3
+
+        def convert_token_to_string(self, token):
+            return b"\xef\xbf\xbd".decode()
+
+    reduced_vocab = reduced_vocabulary(MockTokenizer())
+
+    # See fsm.regex.get_token_transition_keys()
+    # FSM transition keys represent bytes as <null_prefix><hex_byte>
+    assert reduced_vocab[0][1][0] == "\x00A1"
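
Walking through the final assertion: the mock vocabulary's only bytes entry is `b"\xa1"`, so its reduced token string should be the null-prefixed uppercase-hex form. A self-contained check of that arithmetic, reusing the `byte_symbol` sketch above rather than outlines' actual helper:

# Recompute the expected transition-key string for the Qwen-style token.
token = b"\xa1"
token_str = "".join(
    f"\x00{b:02X}" if b >= 0x80 else chr(b)  # inline byte_symbol sketch
    for b in token
)
assert token_str == "\x00A1"  # matches the value the test expects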