From 9b6d273aa479f3f92113ccdcf39d6ad88d2bed3c Mon Sep 17 00:00:00 2001 From: Pedro Cuenca Date: Mon, 30 Sep 2024 10:56:07 +0200 Subject: [PATCH] Llama test tokenizer uses the new merges style --- Tests/TokenizersTests/Resources/tokenizer_tests.json | 2 +- Tests/TokenizersTests/TokenizerTests.swift | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Tests/TokenizersTests/Resources/tokenizer_tests.json b/Tests/TokenizersTests/Resources/tokenizer_tests.json index 4cf624c..6da45f4 100755 --- a/Tests/TokenizersTests/Resources/tokenizer_tests.json +++ b/Tests/TokenizersTests/Resources/tokenizer_tests.json @@ -1 +1 @@ -{"bert-base-uncased": [{"input": "hello world", "encoded": {"input_ids": [101, 7592, 2088, 102], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "[CLS] hello world [SEP]", "decoded_without_special": "hello world"}, {"input": "Hello World", "encoded": {"input_ids": [101, 7592, 2088, 102], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "[CLS] hello world [SEP]", "decoded_without_special": "hello world"}, {"input": "How are you doing?", "encoded": {"input_ids": [101, 2129, 2024, 2017, 2725, 1029, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] how are you doing? [SEP]", "decoded_without_special": "how are you doing?"}, {"input": "You should've done this", "encoded": {"input_ids": [101, 2017, 2323, 1005, 2310, 2589, 2023, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] you should've done this [SEP]", "decoded_without_special": "you should've done this"}, {"input": "A\n'll !!to?'d''d of, can't.", "encoded": {"input_ids": [101, 1037, 1005, 2222, 999, 999, 2000, 1029, 1005, 1040, 1005, 1005, 1040, 1997, 1010, 2064, 1005, 1056, 1012, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] a'll!! to?'d'' d of, can't. [SEP]", "decoded_without_special": "a'll!! to?'d'' d of, can't."}, {"input": "def main():\n\tpass", "encoded": {"input_ids": [101, 13366, 2364, 1006, 1007, 1024, 3413, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] def main ( ) : pass [SEP]", "decoded_without_special": "def main ( ) : pass"}, {"input": "This\n\nis\na\ntest.", "encoded": {"input_ids": [101, 2023, 2003, 1037, 3231, 1012, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] this is a test. [SEP]", "decoded_without_special": "this is a test."}, {"input": "let a = obj.toString();\ntoString();", "encoded": {"input_ids": [101, 2292, 1037, 1027, 27885, 3501, 1012, 2000, 3367, 4892, 1006, 1007, 1025, 2000, 3367, 4892, 1006, 1007, 1025, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] let a = obj. tostring ( ) ; tostring ( ) ; [SEP]", "decoded_without_special": "let a = obj. tostring ( ) ; tostring ( ) ;"}, {"input": "Hi Hello", "encoded": {"input_ids": [101, 7632, 7592, 102], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "[CLS] hi hello [SEP]", "decoded_without_special": "hi hello"}, {"input": "trailing space ", "encoded": {"input_ids": [101, 12542, 2686, 102], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "[CLS] trailing space [SEP]", "decoded_without_special": "trailing space"}, {"input": " leading space", "encoded": {"input_ids": [101, 2877, 2686, 102], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "[CLS] leading space [SEP]", "decoded_without_special": "leading space"}, {"input": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "encoded": {"input_ids": [101, 1910, 100, 1916, 1921, 100, 100, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] \u751f [UNK] \u7684 \u771f [UNK] [UNK] [SEP]", "decoded_without_special": "\u751f \u7684 \u771f"}, {"input": "The company was founded in 2016.", "encoded": {"input_ids": [101, 1996, 2194, 2001, 2631, 1999, 2355, 1012, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] the company was founded in 2016. [SEP]", "decoded_without_special": "the company was founded in 2016."}, {"input": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "encoded": {"input_ids": [101, 3231, 1002, 1015, 1054, 2475, 1001, 1017, 1574, 2549, 27813, 1071, 2575, 100, 1576, 2620, 1575, 2683, 3231, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] test $ 1 r2 # 3 \u20ac4 \u00a35 \u00a56 [UNK] \u20b98 \u20b19 test [SEP]", "decoded_without_special": "test $ 1 r2 # 3 \u20ac4 \u00a35 \u00a56 \u20b98 \u20b19 test"}, {"input": "I bought an apple for $1.00 at the store.", "encoded": {"input_ids": [101, 1045, 4149, 2019, 6207, 2005, 1002, 1015, 1012, 4002, 2012, 1996, 3573, 1012, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] i bought an apple for $ 1. 00 at the store. [SEP]", "decoded_without_special": "i bought an apple for $ 1. 00 at the store."}, {"input": "you\u2026 ", "encoded": {"input_ids": [101, 2017, 1529, 102], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "[CLS] you \u2026 [SEP]", "decoded_without_special": "you \u2026"}, {"input": "you\u2026\u00a0\u00a0", "encoded": {"input_ids": [101, 2017, 1529, 102], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "[CLS] you \u2026 [SEP]", "decoded_without_special": "you \u2026"}, {"input": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "encoded": {"input_ids": [101, 2017, 1529, 2017, 1529, 102], "token_type_ids": [0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] you \u2026 you \u2026 [SEP]", "decoded_without_special": "you \u2026 you \u2026"}], "distilgpt2": [{"input": "hello world", "encoded": {"input_ids": [31373, 995], "attention_mask": [1, 1]}, "decoded_with_special": "hello world", "decoded_without_special": "hello world"}, {"input": "Hello World", "encoded": {"input_ids": [15496, 2159], "attention_mask": [1, 1]}, "decoded_with_special": "Hello World", "decoded_without_special": "Hello World"}, {"input": "How are you doing?", "encoded": {"input_ids": [2437, 389, 345, 1804, 30], "attention_mask": [1, 1, 1, 1, 1]}, "decoded_with_special": "How are you doing?", "decoded_without_special": "How are you doing?"}, {"input": "You should've done this", "encoded": {"input_ids": [1639, 815, 1053, 1760, 428], "attention_mask": [1, 1, 1, 1, 1]}, "decoded_with_special": "You should've done this", "decoded_without_special": "You should've done this"}, {"input": "A\n'll !!to?'d''d of, can't.", "encoded": {"input_ids": [32, 198, 1183, 37867, 1462, 8348, 67, 7061, 67, 286, 11, 460, 470, 13], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "A\n'll!!to?'d''d of, can't.", "decoded_without_special": "A\n'll!!to?'d''d of, can't."}, {"input": "def main():\n\tpass", "encoded": {"input_ids": [4299, 1388, 33529, 198, 197, 6603], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "def main():\n\tpass", "decoded_without_special": "def main():\n\tpass"}, {"input": "This\n\nis\na\ntest.", "encoded": {"input_ids": [1212, 198, 198, 271, 198, 64, 198, 9288, 13], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "This\n\nis\na\ntest.", "decoded_without_special": "This\n\nis\na\ntest."}, {"input": "let a = obj.toString();\ntoString();", "encoded": {"input_ids": [1616, 257, 796, 26181, 13, 1462, 10100, 9783, 198, 1462, 10100, 9783], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "let a = obj.toString();\ntoString();", "decoded_without_special": "let a = obj.toString();\ntoString();"}, {"input": "Hi Hello", "encoded": {"input_ids": [17250, 220, 18435], "attention_mask": [1, 1, 1]}, "decoded_with_special": "Hi Hello", "decoded_without_special": "Hi Hello"}, {"input": "trailing space ", "encoded": {"input_ids": [9535, 4386, 2272, 220, 220, 220], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "trailing space ", "decoded_without_special": "trailing space "}, {"input": " leading space", "encoded": {"input_ids": [220, 220, 3756, 2272], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": " leading space", "decoded_without_special": " leading space"}, {"input": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "encoded": {"input_ids": [37955, 162, 112, 119, 21410, 40367, 253, 164, 108, 249, 42468], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "decoded_without_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f"}, {"input": "The company was founded in 2016.", "encoded": {"input_ids": [464, 1664, 373, 9393, 287, 1584, 13], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "The company was founded in 2016.", "decoded_without_special": "The company was founded in 2016."}, {"input": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "encoded": {"input_ids": [9288, 720, 16, 371, 17, 1303, 18, 10432, 19, 4248, 20, 38221, 21, 2343, 224, 96, 22, 2343, 224, 117, 23, 2343, 224, 109, 24, 1332], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "decoded_without_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test"}, {"input": "I bought an apple for $1.00 at the store.", "encoded": {"input_ids": [40, 5839, 281, 17180, 329, 720, 16, 13, 405, 379, 262, 3650, 13], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "I bought an apple for $1.00 at the store.", "decoded_without_special": "I bought an apple for $1.00 at the store."}, {"input": "you\u2026 ", "encoded": {"input_ids": [5832, 1399, 220, 220], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "you\u2026 ", "decoded_without_special": "you\u2026 "}, {"input": "you\u2026\u00a0\u00a0", "encoded": {"input_ids": [5832, 1399, 4603], "attention_mask": [1, 1, 1]}, "decoded_with_special": "you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0"}, {"input": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "encoded": {"input_ids": [5832, 1399, 1849, 1849, 5832, 1399, 4603], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0"}], "coreml-projects/Llama-2-7b-chat-coreml": [{"input": "hello world", "encoded": {"input_ids": [1, 22172, 3186], "attention_mask": [1, 1, 1]}, "decoded_with_special": " hello world", "decoded_without_special": "hello world"}, {"input": "Hello World", "encoded": {"input_ids": [1, 15043, 2787], "attention_mask": [1, 1, 1]}, "decoded_with_special": " Hello World", "decoded_without_special": "Hello World"}, {"input": "How are you doing?", "encoded": {"input_ids": [1, 1128, 526, 366, 2599, 29973], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": " How are you doing?", "decoded_without_special": "How are you doing?"}, {"input": "You should've done this", "encoded": {"input_ids": [1, 887, 881, 29915, 345, 2309, 445], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " You should've done this", "decoded_without_special": "You should've done this"}, {"input": "A\n'll !!to?'d''d of, can't.", "encoded": {"input_ids": [1, 319, 13, 29915, 645, 21443, 517, 17901, 29881, 4907, 29881, 310, 29892, 508, 29915, 29873, 29889], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " A\n'll !!to?'d''d of, can't.", "decoded_without_special": "A\n'll !!to?'d''d of, can't."}, {"input": "def main():\n\tpass", "encoded": {"input_ids": [1, 822, 1667, 7295, 13, 12, 3364], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " def main():\n\tpass", "decoded_without_special": "def main():\n\tpass"}, {"input": "This\n\nis\na\ntest.", "encoded": {"input_ids": [1, 910, 13, 13, 275, 13, 29874, 13, 1688, 29889], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " This\n\nis\na\ntest.", "decoded_without_special": "This\n\nis\na\ntest."}, {"input": "let a = obj.toString();\ntoString();", "encoded": {"input_ids": [1, 1235, 263, 353, 5446, 29889, 7711, 890, 13, 7711, 890], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " let a = obj.toString();\ntoString();", "decoded_without_special": "let a = obj.toString();\ntoString();"}, {"input": "Hi Hello", "encoded": {"input_ids": [1, 6324, 29871, 15043], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": " Hi Hello", "decoded_without_special": "Hi Hello"}, {"input": "trailing space ", "encoded": {"input_ids": [1, 25053, 2913, 1678], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": " trailing space ", "decoded_without_special": "trailing space "}, {"input": " leading space", "encoded": {"input_ids": [1, 1678, 8236, 2913], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": " leading space", "decoded_without_special": " leading space"}, {"input": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "encoded": {"input_ids": [1, 29871, 30486, 31704, 30210, 30848, 235, 179, 158, 30392], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " \u751f\u6d3b\u7684\u771f\u8c1b\u662f", "decoded_without_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f"}, {"input": "The company was founded in 2016.", "encoded": {"input_ids": [1, 450, 5001, 471, 11091, 297, 29871, 29906, 29900, 29896, 29953, 29889], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " The company was founded in 2016.", "decoded_without_special": "The company was founded in 2016."}, {"input": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "encoded": {"input_ids": [1, 1243, 395, 29896, 390, 29906, 396, 29941, 25540, 29946, 15151, 29945, 29871, 30563, 29953, 29871, 229, 133, 166, 29955, 29871, 30620, 29947, 29871, 229, 133, 180, 29929, 1243], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "decoded_without_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test"}, {"input": "I bought an apple for $1.00 at the store.", "encoded": {"input_ids": [1, 306, 18093, 385, 26163, 363, 395, 29896, 29889, 29900, 29900, 472, 278, 3787, 29889], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " I bought an apple for $1.00 at the store.", "decoded_without_special": "I bought an apple for $1.00 at the store."}, {"input": "you\u2026 ", "encoded": {"input_ids": [1, 366, 30098, 259], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": " you\u2026 ", "decoded_without_special": "you\u2026 "}, {"input": "you\u2026\u00a0\u00a0", "encoded": {"input_ids": [1, 366, 30098, 8655], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": " you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0"}, {"input": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "encoded": {"input_ids": [1, 366, 30098, 8655, 6293, 30098, 8655], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0"}], "tiiuae/falcon-7b": [{"input": "hello world", "encoded": {"input_ids": [30835, 1079], "token_type_ids": [0, 0], "attention_mask": [1, 1]}, "decoded_with_special": "hello world", "decoded_without_special": "hello world"}, {"input": "Hello World", "encoded": {"input_ids": [9856, 2889], "token_type_ids": [0, 0], "attention_mask": [1, 1]}, "decoded_with_special": "Hello World", "decoded_without_special": "Hello World"}, {"input": "How are you doing?", "encoded": {"input_ids": [1830, 362, 299, 1836, 42], "token_type_ids": [0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1]}, "decoded_with_special": "How are you doing?", "decoded_without_special": "How are you doing?"}, {"input": "You should've done this", "encoded": {"input_ids": [1357, 808, 18, 298, 1782, 414], "token_type_ids": [0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "You should've done this", "decoded_without_special": "You should've done this"}, {"input": "A\n'll !!to?'d''d of, can't.", "encoded": {"input_ids": [44, 193, 18, 567, 204, 1409, 534, 12493, 79, 7544, 79, 275, 23, 418, 18, 95, 25], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "A\n'll!!to?'d''d of, can't.", "decoded_without_special": "A\n'll!!to?'d''d of, can't."}, {"input": "def main():\n\tpass", "encoded": {"input_ids": [3071, 1316, 13160, 193, 192, 5412], "token_type_ids": [0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "def main():\n\tpass", "decoded_without_special": "def main():\n\tpass"}, {"input": "This\n\nis\na\ntest.", "encoded": {"input_ids": [1182, 193, 193, 259, 193, 76, 193, 4780, 25], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "This\n\nis\na\ntest.", "decoded_without_special": "This\n\nis\na\ntest."}, {"input": "let a = obj.toString();\ntoString();", "encoded": {"input_ids": [1025, 241, 204, 40, 13756, 25, 19409, 2032, 193, 19409, 2032], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "let a = obj.toString();\ntoString();", "decoded_without_special": "let a = obj.toString();\ntoString();"}, {"input": "Hi Hello", "encoded": {"input_ids": [5516, 204, 23090], "token_type_ids": [0, 0, 0], "attention_mask": [1, 1, 1]}, "decoded_with_special": "Hi Hello", "decoded_without_special": "Hi Hello"}, {"input": "trailing space ", "encoded": {"input_ids": [9172, 4447, 2151, 466], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "trailing space ", "decoded_without_special": "trailing space "}, {"input": " leading space", "encoded": {"input_ids": [258, 3736, 2151], "token_type_ids": [0, 0, 0], "attention_mask": [1, 1, 1]}, "decoded_with_special": " leading space", "decoded_without_special": " leading space"}, {"input": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "encoded": {"input_ids": [32725, 1105, 15498, 8061, 233, 2364], "token_type_ids": [0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "decoded_without_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f"}, {"input": "The company was founded in 2016.", "encoded": {"input_ids": [487, 1438, 398, 9923, 272, 204, 626, 33, 25], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "The company was founded in 2016.", "decoded_without_special": "The company was founded in 2016."}, {"input": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "encoded": {"input_ids": [4780, 204, 15, 28, 382, 29, 204, 14, 30, 6471, 31, 5131, 32, 3068, 110, 33, 25631, 108, 34, 25631, 129, 35, 25631, 121, 36, 1318], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "decoded_without_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test"}, {"input": "I bought an apple for $1.00 at the store.", "encoded": {"input_ids": [52, 5659, 267, 12381, 312, 204, 15, 28, 25, 527, 388, 248, 2946, 25], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "I bought an apple for $1.00 at the store.", "decoded_without_special": "I bought an apple for $1.00 at the store."}, {"input": "you\u2026 ", "encoded": {"input_ids": [5667, 898, 258], "token_type_ids": [0, 0, 0], "attention_mask": [1, 1, 1]}, "decoded_with_special": "you\u2026 ", "decoded_without_special": "you\u2026 "}, {"input": "you\u2026\u00a0\u00a0", "encoded": {"input_ids": [5667, 898, 60482], "token_type_ids": [0, 0, 0], "attention_mask": [1, 1, 1]}, "decoded_with_special": "you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0"}, {"input": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "encoded": {"input_ids": [5667, 898, 4381, 4381, 5667, 898, 60482], "token_type_ids": [0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0"}, {"input": "12 and 123 and 1234", "encoded": {"input_ids": [928, 273, 204, 10963, 273, 204, 10963, 31], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "12 and 123 and 1234", "decoded_without_special": "12 and 123 and 1234"}], "pcuenq/gemma-tokenizer": [{"input": "hello world", "encoded": {"input_ids": [2, 17534, 2134], "attention_mask": [1, 1, 1]}, "decoded_with_special": "hello world", "decoded_without_special": "hello world"}, {"input": "Hello World", "encoded": {"input_ids": [2, 4521, 3855], "attention_mask": [1, 1, 1]}, "decoded_with_special": "Hello World", "decoded_without_special": "Hello World"}, {"input": "How are you doing?", "encoded": {"input_ids": [2, 2299, 708, 692, 3900, 235336], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "How are you doing?", "decoded_without_special": "How are you doing?"}, {"input": "You should've done this", "encoded": {"input_ids": [2, 2045, 1412, 235303, 524, 3015, 736], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "You should've done this", "decoded_without_special": "You should've done this"}, {"input": "A\n'll !!to?'d''d of, can't.", "encoded": {"input_ids": [2, 235280, 108, 235303, 529, 9063, 511, 18016, 235258, 3404, 235258, 576, 235269, 798, 235303, 235251, 235265], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "A\n'll !!to?'d''d of, can't.", "decoded_without_special": "A\n'll !!to?'d''d of, can't."}, {"input": "def main():\n\tpass", "encoded": {"input_ids": [2, 1293, 1872, 4409, 108, 226, 3095], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "def main():\n\tpass", "decoded_without_special": "def main():\n\tpass"}, {"input": "This\n\nis\na\ntest.", "encoded": {"input_ids": [2, 1596, 109, 502, 108, 235250, 108, 2195, 235265], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "This\n\nis\na\ntest.", "decoded_without_special": "This\n\nis\na\ntest."}, {"input": "let a = obj.toString();\ntoString();", "encoded": {"input_ids": [2, 1243, 476, 589, 6555, 235265, 7114, 821, 108, 7114, 821], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "let a = obj.toString();\ntoString();", "decoded_without_special": "let a = obj.toString();\ntoString();"}, {"input": "Hi Hello", "encoded": {"input_ids": [2, 2151, 139, 4521], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "Hi Hello", "decoded_without_special": "Hi Hello"}, {"input": "trailing space ", "encoded": {"input_ids": [2, 100504, 3641, 140], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "trailing space ", "decoded_without_special": "trailing space "}, {"input": " leading space", "encoded": {"input_ids": [2, 140, 26650, 3641], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": " leading space", "decoded_without_special": " leading space"}, {"input": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "encoded": {"input_ids": [2, 122182, 235710, 245467, 235427], "attention_mask": [1, 1, 1, 1, 1]}, "decoded_with_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "decoded_without_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f"}, {"input": "The company was founded in 2016.", "encoded": {"input_ids": [2, 651, 3277, 729, 18942, 575, 235248, 235284, 235276, 235274, 235318, 235265], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "The company was founded in 2016.", "decoded_without_special": "The company was founded in 2016."}, {"input": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "encoded": {"input_ids": [2, 2195, 697, 235274, 625, 235284, 1700, 235304, 8296, 235310, 5955, 235308, 74393, 235318, 235248, 252058, 235324, 56712, 235321, 235248, 243132, 235315, 2121], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "decoded_without_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test"}, {"input": "I bought an apple for $1.00 at the store.", "encoded": {"input_ids": [2, 235285, 8989, 671, 15491, 604, 697, 235274, 235265, 235276, 235276, 696, 573, 4659, 235265], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "I bought an apple for $1.00 at the store.", "decoded_without_special": "I bought an apple for $1.00 at the store."}, {"input": "you\u2026 ", "encoded": {"input_ids": [2, 4747, 235417, 139], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "you\u2026 ", "decoded_without_special": "you\u2026 "}, {"input": "you\u2026\u00a0\u00a0", "encoded": {"input_ids": [2, 4747, 235417, 25445], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0"}, {"input": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "encoded": {"input_ids": [2, 4747, 235417, 25445, 4747, 235417, 25445], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0"}], "openai/whisper-tiny.en": [{"input": "hello world", "encoded": {"input_ids": [50257, 50362, 31373, 995, 50256], "attention_mask": [1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>hello world<|endoftext|>", "decoded_without_special": "hello world"}, {"input": "Hello World", "encoded": {"input_ids": [50257, 50362, 15496, 2159, 50256], "attention_mask": [1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>Hello World<|endoftext|>", "decoded_without_special": "Hello World"}, {"input": "How are you doing?", "encoded": {"input_ids": [50257, 50362, 2437, 389, 345, 1804, 30, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>How are you doing?<|endoftext|>", "decoded_without_special": "How are you doing?"}, {"input": "You should've done this", "encoded": {"input_ids": [50257, 50362, 1639, 815, 1053, 1760, 428, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>You should've done this<|endoftext|>", "decoded_without_special": "You should've done this"}, {"input": "A\n'll !!to?'d''d of, can't.", "encoded": {"input_ids": [50257, 50362, 32, 198, 1183, 37867, 1462, 8348, 67, 7061, 67, 286, 11, 460, 470, 13, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>A\n'll!!to?'d''d of, can't.<|endoftext|>", "decoded_without_special": "A\n'll!!to?'d''d of, can't."}, {"input": "def main():\n\tpass", "encoded": {"input_ids": [50257, 50362, 4299, 1388, 33529, 198, 197, 6603, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>def main():\n\tpass<|endoftext|>", "decoded_without_special": "def main():\n\tpass"}, {"input": "This\n\nis\na\ntest.", "encoded": {"input_ids": [50257, 50362, 1212, 198, 198, 271, 198, 64, 198, 9288, 13, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>This\n\nis\na\ntest.<|endoftext|>", "decoded_without_special": "This\n\nis\na\ntest."}, {"input": "let a = obj.toString();\ntoString();", "encoded": {"input_ids": [50257, 50362, 1616, 257, 796, 26181, 13, 1462, 10100, 9783, 198, 1462, 10100, 9783, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>let a = obj.toString();\ntoString();<|endoftext|>", "decoded_without_special": "let a = obj.toString();\ntoString();"}, {"input": "Hi Hello", "encoded": {"input_ids": [50257, 50362, 17250, 220, 18435, 50256], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>Hi Hello<|endoftext|>", "decoded_without_special": "Hi Hello"}, {"input": "trailing space ", "encoded": {"input_ids": [50257, 50362, 9535, 4386, 2272, 220, 220, 220, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>trailing space <|endoftext|>", "decoded_without_special": "trailing space "}, {"input": " leading space", "encoded": {"input_ids": [50257, 50362, 220, 220, 3756, 2272, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|> leading space<|endoftext|>", "decoded_without_special": " leading space"}, {"input": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "encoded": {"input_ids": [50257, 50362, 37955, 162, 112, 119, 21410, 40367, 253, 164, 108, 249, 42468, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>\u751f\u6d3b\u7684\u771f\u8c1b\u662f<|endoftext|>", "decoded_without_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f"}, {"input": "The company was founded in 2016.", "encoded": {"input_ids": [50257, 50362, 464, 1664, 373, 9393, 287, 1584, 13, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>The company was founded in 2016.<|endoftext|>", "decoded_without_special": "The company was founded in 2016."}, {"input": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "encoded": {"input_ids": [50257, 50362, 9288, 720, 16, 371, 17, 1303, 18, 10432, 19, 4248, 20, 38221, 21, 2343, 224, 96, 22, 2343, 224, 117, 23, 2343, 224, 109, 24, 1332, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test<|endoftext|>", "decoded_without_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test"}, {"input": "I bought an apple for $1.00 at the store.", "encoded": {"input_ids": [50257, 50362, 40, 5839, 281, 17180, 329, 720, 16, 13, 405, 379, 262, 3650, 13, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>I bought an apple for $1.00 at the store.<|endoftext|>", "decoded_without_special": "I bought an apple for $1.00 at the store."}, {"input": "you\u2026 ", "encoded": {"input_ids": [50257, 50362, 5832, 1399, 220, 220, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>you\u2026 <|endoftext|>", "decoded_without_special": "you\u2026 "}, {"input": "you\u2026\u00a0\u00a0", "encoded": {"input_ids": [50257, 50362, 5832, 1399, 4603, 50256], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>you\u2026\u00a0\u00a0<|endoftext|>", "decoded_without_special": "you\u2026\u00a0\u00a0"}, {"input": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "encoded": {"input_ids": [50257, 50362, 5832, 1399, 1849, 1849, 5832, 1399, 4603, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0<|endoftext|>", "decoded_without_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0"}], "openai/whisper-large-v2": [{"input": "hello world", "encoded": {"input_ids": [50258, 50363, 675, 1913, 1002, 50257], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>hello world<|endoftext|>", "decoded_without_special": "hello world"}, {"input": "Hello World", "encoded": {"input_ids": [50258, 50363, 15947, 3937, 50257], "attention_mask": [1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>Hello World<|endoftext|>", "decoded_without_special": "Hello World"}, {"input": "How are you doing?", "encoded": {"input_ids": [50258, 50363, 6462, 366, 291, 884, 30, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>How are you doing?<|endoftext|>", "decoded_without_special": "How are you doing?"}, {"input": "You should've done this", "encoded": {"input_ids": [50258, 50363, 3223, 820, 600, 1096, 341, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>You should've done this<|endoftext|>", "decoded_without_special": "You should've done this"}, {"input": "A\n'll !!to?'d''d of, can't.", "encoded": {"input_ids": [50258, 50363, 32, 198, 603, 15138, 1353, 8569, 67, 15025, 67, 295, 11, 393, 380, 13, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>A\n'll!!to?'d''d of, can't.<|endoftext|>", "decoded_without_special": "A\n'll!!to?'d''d of, can't."}, {"input": "def main():\n\tpass", "encoded": {"input_ids": [50258, 50363, 20595, 2135, 7, 4507, 198, 197, 9216, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>def main():\n\tpass<|endoftext|>", "decoded_without_special": "def main():\n\tpass"}, {"input": "This\n\nis\na\ntest.", "encoded": {"input_ids": [50258, 50363, 5723, 198, 198, 271, 198, 64, 198, 31636, 13, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>This\n\nis\na\ntest.<|endoftext|>", "decoded_without_special": "This\n\nis\na\ntest."}, {"input": "let a = obj.toString();\ntoString();", "encoded": {"input_ids": [50258, 50363, 2631, 257, 6585, 1111, 73, 13, 1353, 4520, 2937, 7, 34446, 198, 1353, 4520, 2937, 7, 34446, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>let a = obj.toString();\ntoString();<|endoftext|>", "decoded_without_special": "let a = obj.toString();\ntoString();"}, {"input": "Hi Hello", "encoded": {"input_ids": [50258, 50363, 17155, 220, 2425, 50257], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>Hi Hello<|endoftext|>", "decoded_without_special": "Hi Hello"}, {"input": "trailing space ", "encoded": {"input_ids": [50258, 50363, 17227, 4883, 1901, 220, 220, 220, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>trailing space <|endoftext|>", "decoded_without_special": "trailing space "}, {"input": " leading space", "encoded": {"input_ids": [50258, 50363, 220, 220, 5775, 1901, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|> leading space<|endoftext|>", "decoded_without_special": " leading space"}, {"input": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "encoded": {"input_ids": [50258, 50363, 49958, 1546, 6303, 8897, 249, 1541, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>\u751f\u6d3b\u7684\u771f\u8c1b\u662f<|endoftext|>", "decoded_without_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f"}, {"input": "The company was founded in 2016.", "encoded": {"input_ids": [50258, 50363, 2278, 2237, 390, 13234, 294, 6549, 13, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>The company was founded in 2016.<|endoftext|>", "decoded_without_special": "The company was founded in 2016."}, {"input": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "encoded": {"input_ids": [50258, 50363, 31636, 1848, 16, 497, 17, 3536, 18, 17450, 19, 14378, 20, 1815, 98, 21, 672, 224, 96, 22, 672, 224, 117, 23, 672, 224, 109, 24, 1500, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test<|endoftext|>", "decoded_without_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test"}, {"input": "I bought an apple for $1.00 at the store.", "encoded": {"input_ids": [50258, 50363, 40, 4243, 364, 10606, 337, 1848, 16, 13, 628, 412, 264, 3531, 13, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>I bought an apple for $1.00 at the store.<|endoftext|>", "decoded_without_special": "I bought an apple for $1.00 at the store."}, {"input": "you\u2026 ", "encoded": {"input_ids": [50258, 50363, 5616, 1260, 220, 220, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>you\u2026 <|endoftext|>", "decoded_without_special": "you\u2026 "}, {"input": "you\u2026\u00a0\u00a0", "encoded": {"input_ids": [50258, 50363, 5616, 1260, 126, 254, 126, 254, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>you\u2026\u00a0\u00a0<|endoftext|>", "decoded_without_special": "you\u2026\u00a0\u00a0"}, {"input": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "encoded": {"input_ids": [50258, 50363, 5616, 1260, 126, 254, 126, 254, 5616, 1260, 126, 254, 126, 254, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0<|endoftext|>", "decoded_without_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0"}], "t5-base": [{"input": "hello world", "encoded": {"input_ids": [21820, 296, 1], "attention_mask": [1, 1, 1]}, "decoded_with_special": "hello world", "decoded_without_special": "hello world"}, {"input": "Hello World", "encoded": {"input_ids": [8774, 1150, 1], "attention_mask": [1, 1, 1]}, "decoded_with_special": "Hello World", "decoded_without_special": "Hello World"}, {"input": "How are you doing?", "encoded": {"input_ids": [571, 33, 25, 692, 58, 1], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "How are you doing?", "decoded_without_special": "How are you doing?"}, {"input": "You should've done this", "encoded": {"input_ids": [148, 225, 31, 162, 612, 48, 1], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "You should've done this", "decoded_without_special": "You should've done this"}, {"input": "A\n'll !!to?'d''d of, can't.", "encoded": {"input_ids": [71, 3, 31, 195, 3, 1603, 235, 58, 31, 26, 31, 31, 26, 13, 6, 54, 31, 17, 5, 1], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "A 'll!!to?'d''d of, can't.", "decoded_without_special": "A 'll!!to?'d''d of, can't."}, {"input": "def main():\n\tpass", "encoded": {"input_ids": [20, 89, 711, 9960, 10, 1903, 1], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "def main(): pass", "decoded_without_special": "def main(): pass"}, {"input": "This\n\nis\na\ntest.", "encoded": {"input_ids": [100, 19, 3, 9, 794, 5, 1], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "This is a test.", "decoded_without_special": "This is a test."}, {"input": "let a = obj.toString();\ntoString();", "encoded": {"input_ids": [752, 3, 9, 3274, 3, 32, 115, 354, 5, 235, 11500, 53, 9960, 117, 12, 11500, 53, 9960, 117, 1], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "let a = obj.toString(); toString();", "decoded_without_special": "let a = obj.toString(); toString();"}, {"input": "Hi Hello", "encoded": {"input_ids": [2018, 8774, 1], "attention_mask": [1, 1, 1]}, "decoded_with_special": "Hi Hello", "decoded_without_special": "Hi Hello"}, {"input": "trailing space ", "encoded": {"input_ids": [5032, 53, 628, 1], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "trailing space", "decoded_without_special": "trailing space"}, {"input": " leading space", "encoded": {"input_ids": [1374, 628, 1], "attention_mask": [1, 1, 1]}, "decoded_with_special": "leading space", "decoded_without_special": "leading space"}, {"input": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "encoded": {"input_ids": [3, 2, 1], "attention_mask": [1, 1, 1]}, "decoded_with_special": "", "decoded_without_special": ""}, {"input": "The company was founded in 2016.", "encoded": {"input_ids": [37, 349, 47, 5710, 16, 4619, 1], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "The company was founded in 2016.", "decoded_without_special": "The company was founded in 2016."}, {"input": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "encoded": {"input_ids": [794, 1970, 391, 357, 20206, 3416, 591, 23978, 3, 2, 948, 3, 2, 940, 3, 2, 927, 3, 2, 1298, 794, 1], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "test $1 R2 #3 \u20ac4 \u00a35 6 7 8 9 test", "decoded_without_special": "test $1 R2 #3 \u20ac4 \u00a35 6 7 8 9 test"}, {"input": "I bought an apple for $1.00 at the store.", "encoded": {"input_ids": [27, 2944, 46, 8947, 21, 1970, 4200, 44, 8, 1078, 5, 1], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "I bought an apple for $1.00 at the store.", "decoded_without_special": "I bought an apple for $1.00 at the store."}, {"input": "you\u2026 ", "encoded": {"input_ids": [25, 233, 1], "attention_mask": [1, 1, 1]}, "decoded_with_special": "you...", "decoded_without_special": "you..."}, {"input": "you\u2026\u00a0\u00a0", "encoded": {"input_ids": [25, 233, 1], "attention_mask": [1, 1, 1]}, "decoded_with_special": "you...", "decoded_without_special": "you..."}, {"input": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "encoded": {"input_ids": [25, 233, 25, 233, 1], "attention_mask": [1, 1, 1, 1, 1]}, "decoded_with_special": "you... you...", "decoded_without_special": "you... you..."}], "mlx-community/Llama-3.2-3B-Instruct-4bit": [{"input": "hello world", "encoded": {"input_ids": [128000, 15339, 1917], "attention_mask": [1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>hello world", "decoded_without_special": "hello world"}, {"input": "Hello World", "encoded": {"input_ids": [128000, 9906, 4435], "attention_mask": [1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>Hello World", "decoded_without_special": "Hello World"}, {"input": "How are you doing?", "encoded": {"input_ids": [128000, 4438, 527, 499, 3815, 30], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>How are you doing?", "decoded_without_special": "How are you doing?"}, {"input": "You should've done this", "encoded": {"input_ids": [128000, 2675, 1288, 3077, 2884, 420], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>You should've done this", "decoded_without_special": "You should've done this"}, {"input": "A\n'll !!to?'d''d of, can't.", "encoded": {"input_ids": [128000, 32, 198, 3358, 11261, 998, 20837, 67, 4708, 67, 315, 11, 649, 956, 13], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>A\n'll!!to?'d''d of, can't.", "decoded_without_special": "A\n'll!!to?'d''d of, can't."}, {"input": "def main():\n\tpass", "encoded": {"input_ids": [128000, 755, 1925, 4019, 42531], "attention_mask": [1, 1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>def main():\n\tpass", "decoded_without_special": "def main():\n\tpass"}, {"input": "This\n\nis\na\ntest.", "encoded": {"input_ids": [128000, 2028, 271, 285, 198, 64, 198, 1985, 13], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>This\n\nis\na\ntest.", "decoded_without_special": "This\n\nis\na\ntest."}, {"input": "let a = obj.toString();\ntoString();", "encoded": {"input_ids": [128000, 1169, 264, 284, 2909, 5180, 545, 6712, 2178], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>let a = obj.toString();\ntoString();", "decoded_without_special": "let a = obj.toString();\ntoString();"}, {"input": "Hi Hello", "encoded": {"input_ids": [128000, 13347, 220, 22691], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>Hi Hello", "decoded_without_special": "Hi Hello"}, {"input": "trailing space ", "encoded": {"input_ids": [128000, 376, 14612, 3634, 262], "attention_mask": [1, 1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>trailing space ", "decoded_without_special": "trailing space "}, {"input": " leading space", "encoded": {"input_ids": [128000, 256, 6522, 3634], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|> leading space", "decoded_without_special": " leading space"}, {"input": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "encoded": {"input_ids": [128000, 104654, 9554, 89151, 39013, 249, 21043], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "decoded_without_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f"}, {"input": "The company was founded in 2016.", "encoded": {"input_ids": [128000, 791, 2883, 574, 18538, 304, 220, 679, 21, 13], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>The company was founded in 2016.", "decoded_without_special": "The company was founded in 2016."}, {"input": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "encoded": {"input_ids": [128000, 1985, 400, 16, 432, 17, 674, 18, 13281, 19, 7083, 20, 72588, 21, 113384, 96, 22, 90891, 23, 113384, 109, 24, 1296], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "decoded_without_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test"}, {"input": "I bought an apple for $1.00 at the store.", "encoded": {"input_ids": [128000, 40, 11021, 459, 24149, 369, 400, 16, 13, 410, 520, 279, 3637, 13], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>I bought an apple for $1.00 at the store.", "decoded_without_special": "I bought an apple for $1.00 at the store."}, {"input": "you\u2026 ", "encoded": {"input_ids": [128000, 9514, 1981, 256], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>you\u2026 ", "decoded_without_special": "you\u2026 "}, {"input": "you\u2026\u00a0\u00a0", "encoded": {"input_ids": [128000, 9514, 1981, 9421], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0"}, {"input": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "encoded": {"input_ids": [128000, 9514, 1981, 4194, 4194, 9514, 1981, 9421], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0"}]} \ No newline at end of file +{"bert-base-uncased": [{"input": "hello world", "encoded": {"input_ids": [101, 7592, 2088, 102], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "[CLS] hello world [SEP]", "decoded_without_special": "hello world"}, {"input": "Hello World", "encoded": {"input_ids": [101, 7592, 2088, 102], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "[CLS] hello world [SEP]", "decoded_without_special": "hello world"}, {"input": "How are you doing?", "encoded": {"input_ids": [101, 2129, 2024, 2017, 2725, 1029, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] how are you doing? [SEP]", "decoded_without_special": "how are you doing?"}, {"input": "You should've done this", "encoded": {"input_ids": [101, 2017, 2323, 1005, 2310, 2589, 2023, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] you should've done this [SEP]", "decoded_without_special": "you should've done this"}, {"input": "A\n'll !!to?'d''d of, can't.", "encoded": {"input_ids": [101, 1037, 1005, 2222, 999, 999, 2000, 1029, 1005, 1040, 1005, 1005, 1040, 1997, 1010, 2064, 1005, 1056, 1012, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] a'll!! to?'d'' d of, can't. [SEP]", "decoded_without_special": "a'll!! to?'d'' d of, can't."}, {"input": "def main():\n\tpass", "encoded": {"input_ids": [101, 13366, 2364, 1006, 1007, 1024, 3413, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] def main ( ) : pass [SEP]", "decoded_without_special": "def main ( ) : pass"}, {"input": "This\n\nis\na\ntest.", "encoded": {"input_ids": [101, 2023, 2003, 1037, 3231, 1012, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] this is a test. [SEP]", "decoded_without_special": "this is a test."}, {"input": "let a = obj.toString();\ntoString();", "encoded": {"input_ids": [101, 2292, 1037, 1027, 27885, 3501, 1012, 2000, 3367, 4892, 1006, 1007, 1025, 2000, 3367, 4892, 1006, 1007, 1025, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] let a = obj. tostring ( ) ; tostring ( ) ; [SEP]", "decoded_without_special": "let a = obj. tostring ( ) ; tostring ( ) ;"}, {"input": "Hi Hello", "encoded": {"input_ids": [101, 7632, 7592, 102], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "[CLS] hi hello [SEP]", "decoded_without_special": "hi hello"}, {"input": "trailing space ", "encoded": {"input_ids": [101, 12542, 2686, 102], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "[CLS] trailing space [SEP]", "decoded_without_special": "trailing space"}, {"input": " leading space", "encoded": {"input_ids": [101, 2877, 2686, 102], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "[CLS] leading space [SEP]", "decoded_without_special": "leading space"}, {"input": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "encoded": {"input_ids": [101, 1910, 100, 1916, 1921, 100, 100, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] \u751f [UNK] \u7684 \u771f [UNK] [UNK] [SEP]", "decoded_without_special": "\u751f \u7684 \u771f"}, {"input": "The company was founded in 2016.", "encoded": {"input_ids": [101, 1996, 2194, 2001, 2631, 1999, 2355, 1012, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] the company was founded in 2016. [SEP]", "decoded_without_special": "the company was founded in 2016."}, {"input": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "encoded": {"input_ids": [101, 3231, 1002, 1015, 1054, 2475, 1001, 1017, 1574, 2549, 27813, 1071, 2575, 100, 1576, 2620, 1575, 2683, 3231, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] test $ 1 r2 # 3 \u20ac4 \u00a35 \u00a56 [UNK] \u20b98 \u20b19 test [SEP]", "decoded_without_special": "test $ 1 r2 # 3 \u20ac4 \u00a35 \u00a56 \u20b98 \u20b19 test"}, {"input": "I bought an apple for $1.00 at the store.", "encoded": {"input_ids": [101, 1045, 4149, 2019, 6207, 2005, 1002, 1015, 1012, 4002, 2012, 1996, 3573, 1012, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] i bought an apple for $ 1. 00 at the store. [SEP]", "decoded_without_special": "i bought an apple for $ 1. 00 at the store."}, {"input": "you\u2026 ", "encoded": {"input_ids": [101, 2017, 1529, 102], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "[CLS] you \u2026 [SEP]", "decoded_without_special": "you \u2026"}, {"input": "you\u2026\u00a0\u00a0", "encoded": {"input_ids": [101, 2017, 1529, 102], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "[CLS] you \u2026 [SEP]", "decoded_without_special": "you \u2026"}, {"input": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "encoded": {"input_ids": [101, 2017, 1529, 2017, 1529, 102], "token_type_ids": [0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "[CLS] you \u2026 you \u2026 [SEP]", "decoded_without_special": "you \u2026 you \u2026"}], "distilgpt2": [{"input": "hello world", "encoded": {"input_ids": [31373, 995], "attention_mask": [1, 1]}, "decoded_with_special": "hello world", "decoded_without_special": "hello world"}, {"input": "Hello World", "encoded": {"input_ids": [15496, 2159], "attention_mask": [1, 1]}, "decoded_with_special": "Hello World", "decoded_without_special": "Hello World"}, {"input": "How are you doing?", "encoded": {"input_ids": [2437, 389, 345, 1804, 30], "attention_mask": [1, 1, 1, 1, 1]}, "decoded_with_special": "How are you doing?", "decoded_without_special": "How are you doing?"}, {"input": "You should've done this", "encoded": {"input_ids": [1639, 815, 1053, 1760, 428], "attention_mask": [1, 1, 1, 1, 1]}, "decoded_with_special": "You should've done this", "decoded_without_special": "You should've done this"}, {"input": "A\n'll !!to?'d''d of, can't.", "encoded": {"input_ids": [32, 198, 1183, 37867, 1462, 8348, 67, 7061, 67, 286, 11, 460, 470, 13], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "A\n'll!!to?'d''d of, can't.", "decoded_without_special": "A\n'll!!to?'d''d of, can't."}, {"input": "def main():\n\tpass", "encoded": {"input_ids": [4299, 1388, 33529, 198, 197, 6603], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "def main():\n\tpass", "decoded_without_special": "def main():\n\tpass"}, {"input": "This\n\nis\na\ntest.", "encoded": {"input_ids": [1212, 198, 198, 271, 198, 64, 198, 9288, 13], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "This\n\nis\na\ntest.", "decoded_without_special": "This\n\nis\na\ntest."}, {"input": "let a = obj.toString();\ntoString();", "encoded": {"input_ids": [1616, 257, 796, 26181, 13, 1462, 10100, 9783, 198, 1462, 10100, 9783], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "let a = obj.toString();\ntoString();", "decoded_without_special": "let a = obj.toString();\ntoString();"}, {"input": "Hi Hello", "encoded": {"input_ids": [17250, 220, 18435], "attention_mask": [1, 1, 1]}, "decoded_with_special": "Hi Hello", "decoded_without_special": "Hi Hello"}, {"input": "trailing space ", "encoded": {"input_ids": [9535, 4386, 2272, 220, 220, 220], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "trailing space ", "decoded_without_special": "trailing space "}, {"input": " leading space", "encoded": {"input_ids": [220, 220, 3756, 2272], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": " leading space", "decoded_without_special": " leading space"}, {"input": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "encoded": {"input_ids": [37955, 162, 112, 119, 21410, 40367, 253, 164, 108, 249, 42468], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "decoded_without_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f"}, {"input": "The company was founded in 2016.", "encoded": {"input_ids": [464, 1664, 373, 9393, 287, 1584, 13], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "The company was founded in 2016.", "decoded_without_special": "The company was founded in 2016."}, {"input": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "encoded": {"input_ids": [9288, 720, 16, 371, 17, 1303, 18, 10432, 19, 4248, 20, 38221, 21, 2343, 224, 96, 22, 2343, 224, 117, 23, 2343, 224, 109, 24, 1332], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "decoded_without_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test"}, {"input": "I bought an apple for $1.00 at the store.", "encoded": {"input_ids": [40, 5839, 281, 17180, 329, 720, 16, 13, 405, 379, 262, 3650, 13], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "I bought an apple for $1.00 at the store.", "decoded_without_special": "I bought an apple for $1.00 at the store."}, {"input": "you\u2026 ", "encoded": {"input_ids": [5832, 1399, 220, 220], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "you\u2026 ", "decoded_without_special": "you\u2026 "}, {"input": "you\u2026\u00a0\u00a0", "encoded": {"input_ids": [5832, 1399, 4603], "attention_mask": [1, 1, 1]}, "decoded_with_special": "you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0"}, {"input": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "encoded": {"input_ids": [5832, 1399, 1849, 1849, 5832, 1399, 4603], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0"}], "coreml-projects/Llama-2-7b-chat-coreml": [{"input": "hello world", "encoded": {"input_ids": [1, 22172, 3186], "attention_mask": [1, 1, 1]}, "decoded_with_special": " hello world", "decoded_without_special": "hello world"}, {"input": "Hello World", "encoded": {"input_ids": [1, 15043, 2787], "attention_mask": [1, 1, 1]}, "decoded_with_special": " Hello World", "decoded_without_special": "Hello World"}, {"input": "How are you doing?", "encoded": {"input_ids": [1, 1128, 526, 366, 2599, 29973], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": " How are you doing?", "decoded_without_special": "How are you doing?"}, {"input": "You should've done this", "encoded": {"input_ids": [1, 887, 881, 29915, 345, 2309, 445], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " You should've done this", "decoded_without_special": "You should've done this"}, {"input": "A\n'll !!to?'d''d of, can't.", "encoded": {"input_ids": [1, 319, 13, 29915, 645, 21443, 517, 17901, 29881, 4907, 29881, 310, 29892, 508, 29915, 29873, 29889], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " A\n'll !!to?'d''d of, can't.", "decoded_without_special": "A\n'll !!to?'d''d of, can't."}, {"input": "def main():\n\tpass", "encoded": {"input_ids": [1, 822, 1667, 7295, 13, 12, 3364], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " def main():\n\tpass", "decoded_without_special": "def main():\n\tpass"}, {"input": "This\n\nis\na\ntest.", "encoded": {"input_ids": [1, 910, 13, 13, 275, 13, 29874, 13, 1688, 29889], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " This\n\nis\na\ntest.", "decoded_without_special": "This\n\nis\na\ntest."}, {"input": "let a = obj.toString();\ntoString();", "encoded": {"input_ids": [1, 1235, 263, 353, 5446, 29889, 7711, 890, 13, 7711, 890], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " let a = obj.toString();\ntoString();", "decoded_without_special": "let a = obj.toString();\ntoString();"}, {"input": "Hi Hello", "encoded": {"input_ids": [1, 6324, 29871, 15043], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": " Hi Hello", "decoded_without_special": "Hi Hello"}, {"input": "trailing space ", "encoded": {"input_ids": [1, 25053, 2913, 1678], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": " trailing space ", "decoded_without_special": "trailing space "}, {"input": " leading space", "encoded": {"input_ids": [1, 1678, 8236, 2913], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": " leading space", "decoded_without_special": " leading space"}, {"input": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "encoded": {"input_ids": [1, 29871, 30486, 31704, 30210, 30848, 235, 179, 158, 30392], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " \u751f\u6d3b\u7684\u771f\u8c1b\u662f", "decoded_without_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f"}, {"input": "The company was founded in 2016.", "encoded": {"input_ids": [1, 450, 5001, 471, 11091, 297, 29871, 29906, 29900, 29896, 29953, 29889], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " The company was founded in 2016.", "decoded_without_special": "The company was founded in 2016."}, {"input": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "encoded": {"input_ids": [1, 1243, 395, 29896, 390, 29906, 396, 29941, 25540, 29946, 15151, 29945, 29871, 30563, 29953, 29871, 229, 133, 166, 29955, 29871, 30620, 29947, 29871, 229, 133, 180, 29929, 1243], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "decoded_without_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test"}, {"input": "I bought an apple for $1.00 at the store.", "encoded": {"input_ids": [1, 306, 18093, 385, 26163, 363, 395, 29896, 29889, 29900, 29900, 472, 278, 3787, 29889], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " I bought an apple for $1.00 at the store.", "decoded_without_special": "I bought an apple for $1.00 at the store."}, {"input": "you\u2026 ", "encoded": {"input_ids": [1, 366, 30098, 259], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": " you\u2026 ", "decoded_without_special": "you\u2026 "}, {"input": "you\u2026\u00a0\u00a0", "encoded": {"input_ids": [1, 366, 30098, 8655], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": " you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0"}, {"input": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "encoded": {"input_ids": [1, 366, 30098, 8655, 6293, 30098, 8655], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": " you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0"}], "tiiuae/falcon-7b": [{"input": "hello world", "encoded": {"input_ids": [30835, 1079], "token_type_ids": [0, 0], "attention_mask": [1, 1]}, "decoded_with_special": "hello world", "decoded_without_special": "hello world"}, {"input": "Hello World", "encoded": {"input_ids": [9856, 2889], "token_type_ids": [0, 0], "attention_mask": [1, 1]}, "decoded_with_special": "Hello World", "decoded_without_special": "Hello World"}, {"input": "How are you doing?", "encoded": {"input_ids": [1830, 362, 299, 1836, 42], "token_type_ids": [0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1]}, "decoded_with_special": "How are you doing?", "decoded_without_special": "How are you doing?"}, {"input": "You should've done this", "encoded": {"input_ids": [1357, 808, 18, 298, 1782, 414], "token_type_ids": [0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "You should've done this", "decoded_without_special": "You should've done this"}, {"input": "A\n'll !!to?'d''d of, can't.", "encoded": {"input_ids": [44, 193, 18, 567, 204, 1409, 534, 12493, 79, 7544, 79, 275, 23, 418, 18, 95, 25], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "A\n'll!!to?'d''d of, can't.", "decoded_without_special": "A\n'll!!to?'d''d of, can't."}, {"input": "def main():\n\tpass", "encoded": {"input_ids": [3071, 1316, 13160, 193, 192, 5412], "token_type_ids": [0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "def main():\n\tpass", "decoded_without_special": "def main():\n\tpass"}, {"input": "This\n\nis\na\ntest.", "encoded": {"input_ids": [1182, 193, 193, 259, 193, 76, 193, 4780, 25], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "This\n\nis\na\ntest.", "decoded_without_special": "This\n\nis\na\ntest."}, {"input": "let a = obj.toString();\ntoString();", "encoded": {"input_ids": [1025, 241, 204, 40, 13756, 25, 19409, 2032, 193, 19409, 2032], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "let a = obj.toString();\ntoString();", "decoded_without_special": "let a = obj.toString();\ntoString();"}, {"input": "Hi Hello", "encoded": {"input_ids": [5516, 204, 23090], "token_type_ids": [0, 0, 0], "attention_mask": [1, 1, 1]}, "decoded_with_special": "Hi Hello", "decoded_without_special": "Hi Hello"}, {"input": "trailing space ", "encoded": {"input_ids": [9172, 4447, 2151, 466], "token_type_ids": [0, 0, 0, 0], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "trailing space ", "decoded_without_special": "trailing space "}, {"input": " leading space", "encoded": {"input_ids": [258, 3736, 2151], "token_type_ids": [0, 0, 0], "attention_mask": [1, 1, 1]}, "decoded_with_special": " leading space", "decoded_without_special": " leading space"}, {"input": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "encoded": {"input_ids": [32725, 1105, 15498, 8061, 233, 2364], "token_type_ids": [0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "decoded_without_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f"}, {"input": "The company was founded in 2016.", "encoded": {"input_ids": [487, 1438, 398, 9923, 272, 204, 626, 33, 25], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "The company was founded in 2016.", "decoded_without_special": "The company was founded in 2016."}, {"input": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "encoded": {"input_ids": [4780, 204, 15, 28, 382, 29, 204, 14, 30, 6471, 31, 5131, 32, 3068, 110, 33, 25631, 108, 34, 25631, 129, 35, 25631, 121, 36, 1318], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "decoded_without_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test"}, {"input": "I bought an apple for $1.00 at the store.", "encoded": {"input_ids": [52, 5659, 267, 12381, 312, 204, 15, 28, 25, 527, 388, 248, 2946, 25], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "I bought an apple for $1.00 at the store.", "decoded_without_special": "I bought an apple for $1.00 at the store."}, {"input": "you\u2026 ", "encoded": {"input_ids": [5667, 898, 258], "token_type_ids": [0, 0, 0], "attention_mask": [1, 1, 1]}, "decoded_with_special": "you\u2026 ", "decoded_without_special": "you\u2026 "}, {"input": "you\u2026\u00a0\u00a0", "encoded": {"input_ids": [5667, 898, 60482], "token_type_ids": [0, 0, 0], "attention_mask": [1, 1, 1]}, "decoded_with_special": "you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0"}, {"input": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "encoded": {"input_ids": [5667, 898, 4381, 4381, 5667, 898, 60482], "token_type_ids": [0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0"}, {"input": "12 and 123 and 1234", "encoded": {"input_ids": [928, 273, 204, 10963, 273, 204, 10963, 31], "token_type_ids": [0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "12 and 123 and 1234", "decoded_without_special": "12 and 123 and 1234"}], "pcuenq/gemma-tokenizer": [{"input": "hello world", "encoded": {"input_ids": [2, 17534, 2134], "attention_mask": [1, 1, 1]}, "decoded_with_special": "hello world", "decoded_without_special": "hello world"}, {"input": "Hello World", "encoded": {"input_ids": [2, 4521, 3855], "attention_mask": [1, 1, 1]}, "decoded_with_special": "Hello World", "decoded_without_special": "Hello World"}, {"input": "How are you doing?", "encoded": {"input_ids": [2, 2299, 708, 692, 3900, 235336], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "How are you doing?", "decoded_without_special": "How are you doing?"}, {"input": "You should've done this", "encoded": {"input_ids": [2, 2045, 1412, 235303, 524, 3015, 736], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "You should've done this", "decoded_without_special": "You should've done this"}, {"input": "A\n'll !!to?'d''d of, can't.", "encoded": {"input_ids": [2, 235280, 108, 235303, 529, 9063, 511, 18016, 235258, 3404, 235258, 576, 235269, 798, 235303, 235251, 235265], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "A\n'll !!to?'d''d of, can't.", "decoded_without_special": "A\n'll !!to?'d''d of, can't."}, {"input": "def main():\n\tpass", "encoded": {"input_ids": [2, 1293, 1872, 4409, 108, 226, 3095], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "def main():\n\tpass", "decoded_without_special": "def main():\n\tpass"}, {"input": "This\n\nis\na\ntest.", "encoded": {"input_ids": [2, 1596, 109, 502, 108, 235250, 108, 2195, 235265], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "This\n\nis\na\ntest.", "decoded_without_special": "This\n\nis\na\ntest."}, {"input": "let a = obj.toString();\ntoString();", "encoded": {"input_ids": [2, 1243, 476, 589, 6555, 235265, 7114, 821, 108, 7114, 821], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "let a = obj.toString();\ntoString();", "decoded_without_special": "let a = obj.toString();\ntoString();"}, {"input": "Hi Hello", "encoded": {"input_ids": [2, 2151, 139, 4521], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "Hi Hello", "decoded_without_special": "Hi Hello"}, {"input": "trailing space ", "encoded": {"input_ids": [2, 100504, 3641, 140], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "trailing space ", "decoded_without_special": "trailing space "}, {"input": " leading space", "encoded": {"input_ids": [2, 140, 26650, 3641], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": " leading space", "decoded_without_special": " leading space"}, {"input": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "encoded": {"input_ids": [2, 122182, 235710, 245467, 235427], "attention_mask": [1, 1, 1, 1, 1]}, "decoded_with_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "decoded_without_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f"}, {"input": "The company was founded in 2016.", "encoded": {"input_ids": [2, 651, 3277, 729, 18942, 575, 235248, 235284, 235276, 235274, 235318, 235265], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "The company was founded in 2016.", "decoded_without_special": "The company was founded in 2016."}, {"input": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "encoded": {"input_ids": [2, 2195, 697, 235274, 625, 235284, 1700, 235304, 8296, 235310, 5955, 235308, 74393, 235318, 235248, 252058, 235324, 56712, 235321, 235248, 243132, 235315, 2121], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "decoded_without_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test"}, {"input": "I bought an apple for $1.00 at the store.", "encoded": {"input_ids": [2, 235285, 8989, 671, 15491, 604, 697, 235274, 235265, 235276, 235276, 696, 573, 4659, 235265], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "I bought an apple for $1.00 at the store.", "decoded_without_special": "I bought an apple for $1.00 at the store."}, {"input": "you\u2026 ", "encoded": {"input_ids": [2, 4747, 235417, 139], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "you\u2026 ", "decoded_without_special": "you\u2026 "}, {"input": "you\u2026\u00a0\u00a0", "encoded": {"input_ids": [2, 4747, 235417, 25445], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0"}, {"input": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "encoded": {"input_ids": [2, 4747, 235417, 25445, 4747, 235417, 25445], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0"}], "openai/whisper-tiny.en": [{"input": "hello world", "encoded": {"input_ids": [50257, 50362, 31373, 995, 50256], "attention_mask": [1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>hello world<|endoftext|>", "decoded_without_special": "hello world"}, {"input": "Hello World", "encoded": {"input_ids": [50257, 50362, 15496, 2159, 50256], "attention_mask": [1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>Hello World<|endoftext|>", "decoded_without_special": "Hello World"}, {"input": "How are you doing?", "encoded": {"input_ids": [50257, 50362, 2437, 389, 345, 1804, 30, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>How are you doing?<|endoftext|>", "decoded_without_special": "How are you doing?"}, {"input": "You should've done this", "encoded": {"input_ids": [50257, 50362, 1639, 815, 1053, 1760, 428, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>You should've done this<|endoftext|>", "decoded_without_special": "You should've done this"}, {"input": "A\n'll !!to?'d''d of, can't.", "encoded": {"input_ids": [50257, 50362, 32, 198, 1183, 37867, 1462, 8348, 67, 7061, 67, 286, 11, 460, 470, 13, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>A\n'll!!to?'d''d of, can't.<|endoftext|>", "decoded_without_special": "A\n'll!!to?'d''d of, can't."}, {"input": "def main():\n\tpass", "encoded": {"input_ids": [50257, 50362, 4299, 1388, 33529, 198, 197, 6603, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>def main():\n\tpass<|endoftext|>", "decoded_without_special": "def main():\n\tpass"}, {"input": "This\n\nis\na\ntest.", "encoded": {"input_ids": [50257, 50362, 1212, 198, 198, 271, 198, 64, 198, 9288, 13, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>This\n\nis\na\ntest.<|endoftext|>", "decoded_without_special": "This\n\nis\na\ntest."}, {"input": "let a = obj.toString();\ntoString();", "encoded": {"input_ids": [50257, 50362, 1616, 257, 796, 26181, 13, 1462, 10100, 9783, 198, 1462, 10100, 9783, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>let a = obj.toString();\ntoString();<|endoftext|>", "decoded_without_special": "let a = obj.toString();\ntoString();"}, {"input": "Hi Hello", "encoded": {"input_ids": [50257, 50362, 17250, 220, 18435, 50256], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>Hi Hello<|endoftext|>", "decoded_without_special": "Hi Hello"}, {"input": "trailing space ", "encoded": {"input_ids": [50257, 50362, 9535, 4386, 2272, 220, 220, 220, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>trailing space <|endoftext|>", "decoded_without_special": "trailing space "}, {"input": " leading space", "encoded": {"input_ids": [50257, 50362, 220, 220, 3756, 2272, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|> leading space<|endoftext|>", "decoded_without_special": " leading space"}, {"input": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "encoded": {"input_ids": [50257, 50362, 37955, 162, 112, 119, 21410, 40367, 253, 164, 108, 249, 42468, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>\u751f\u6d3b\u7684\u771f\u8c1b\u662f<|endoftext|>", "decoded_without_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f"}, {"input": "The company was founded in 2016.", "encoded": {"input_ids": [50257, 50362, 464, 1664, 373, 9393, 287, 1584, 13, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>The company was founded in 2016.<|endoftext|>", "decoded_without_special": "The company was founded in 2016."}, {"input": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "encoded": {"input_ids": [50257, 50362, 9288, 720, 16, 371, 17, 1303, 18, 10432, 19, 4248, 20, 38221, 21, 2343, 224, 96, 22, 2343, 224, 117, 23, 2343, 224, 109, 24, 1332, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test<|endoftext|>", "decoded_without_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test"}, {"input": "I bought an apple for $1.00 at the store.", "encoded": {"input_ids": [50257, 50362, 40, 5839, 281, 17180, 329, 720, 16, 13, 405, 379, 262, 3650, 13, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>I bought an apple for $1.00 at the store.<|endoftext|>", "decoded_without_special": "I bought an apple for $1.00 at the store."}, {"input": "you\u2026 ", "encoded": {"input_ids": [50257, 50362, 5832, 1399, 220, 220, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>you\u2026 <|endoftext|>", "decoded_without_special": "you\u2026 "}, {"input": "you\u2026\u00a0\u00a0", "encoded": {"input_ids": [50257, 50362, 5832, 1399, 4603, 50256], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>you\u2026\u00a0\u00a0<|endoftext|>", "decoded_without_special": "you\u2026\u00a0\u00a0"}, {"input": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "encoded": {"input_ids": [50257, 50362, 5832, 1399, 1849, 1849, 5832, 1399, 4603, 50256], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0<|endoftext|>", "decoded_without_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0"}], "openai/whisper-large-v2": [{"input": "hello world", "encoded": {"input_ids": [50258, 50363, 675, 1913, 1002, 50257], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>hello world<|endoftext|>", "decoded_without_special": "hello world"}, {"input": "Hello World", "encoded": {"input_ids": [50258, 50363, 15947, 3937, 50257], "attention_mask": [1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>Hello World<|endoftext|>", "decoded_without_special": "Hello World"}, {"input": "How are you doing?", "encoded": {"input_ids": [50258, 50363, 6462, 366, 291, 884, 30, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>How are you doing?<|endoftext|>", "decoded_without_special": "How are you doing?"}, {"input": "You should've done this", "encoded": {"input_ids": [50258, 50363, 3223, 820, 600, 1096, 341, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>You should've done this<|endoftext|>", "decoded_without_special": "You should've done this"}, {"input": "A\n'll !!to?'d''d of, can't.", "encoded": {"input_ids": [50258, 50363, 32, 198, 603, 15138, 1353, 8569, 67, 15025, 67, 295, 11, 393, 380, 13, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>A\n'll!!to?'d''d of, can't.<|endoftext|>", "decoded_without_special": "A\n'll!!to?'d''d of, can't."}, {"input": "def main():\n\tpass", "encoded": {"input_ids": [50258, 50363, 20595, 2135, 7, 4507, 198, 197, 9216, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>def main():\n\tpass<|endoftext|>", "decoded_without_special": "def main():\n\tpass"}, {"input": "This\n\nis\na\ntest.", "encoded": {"input_ids": [50258, 50363, 5723, 198, 198, 271, 198, 64, 198, 31636, 13, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>This\n\nis\na\ntest.<|endoftext|>", "decoded_without_special": "This\n\nis\na\ntest."}, {"input": "let a = obj.toString();\ntoString();", "encoded": {"input_ids": [50258, 50363, 2631, 257, 6585, 1111, 73, 13, 1353, 4520, 2937, 7, 34446, 198, 1353, 4520, 2937, 7, 34446, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>let a = obj.toString();\ntoString();<|endoftext|>", "decoded_without_special": "let a = obj.toString();\ntoString();"}, {"input": "Hi Hello", "encoded": {"input_ids": [50258, 50363, 17155, 220, 2425, 50257], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>Hi Hello<|endoftext|>", "decoded_without_special": "Hi Hello"}, {"input": "trailing space ", "encoded": {"input_ids": [50258, 50363, 17227, 4883, 1901, 220, 220, 220, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>trailing space <|endoftext|>", "decoded_without_special": "trailing space "}, {"input": " leading space", "encoded": {"input_ids": [50258, 50363, 220, 220, 5775, 1901, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|> leading space<|endoftext|>", "decoded_without_special": " leading space"}, {"input": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "encoded": {"input_ids": [50258, 50363, 49958, 1546, 6303, 8897, 249, 1541, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>\u751f\u6d3b\u7684\u771f\u8c1b\u662f<|endoftext|>", "decoded_without_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f"}, {"input": "The company was founded in 2016.", "encoded": {"input_ids": [50258, 50363, 2278, 2237, 390, 13234, 294, 6549, 13, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>The company was founded in 2016.<|endoftext|>", "decoded_without_special": "The company was founded in 2016."}, {"input": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "encoded": {"input_ids": [50258, 50363, 31636, 1848, 16, 497, 17, 3536, 18, 17450, 19, 14378, 20, 1815, 98, 21, 672, 224, 96, 22, 672, 224, 117, 23, 672, 224, 109, 24, 1500, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test<|endoftext|>", "decoded_without_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test"}, {"input": "I bought an apple for $1.00 at the store.", "encoded": {"input_ids": [50258, 50363, 40, 4243, 364, 10606, 337, 1848, 16, 13, 628, 412, 264, 3531, 13, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>I bought an apple for $1.00 at the store.<|endoftext|>", "decoded_without_special": "I bought an apple for $1.00 at the store."}, {"input": "you\u2026 ", "encoded": {"input_ids": [50258, 50363, 5616, 1260, 220, 220, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>you\u2026 <|endoftext|>", "decoded_without_special": "you\u2026 "}, {"input": "you\u2026\u00a0\u00a0", "encoded": {"input_ids": [50258, 50363, 5616, 1260, 126, 254, 126, 254, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>you\u2026\u00a0\u00a0<|endoftext|>", "decoded_without_special": "you\u2026\u00a0\u00a0"}, {"input": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "encoded": {"input_ids": [50258, 50363, 5616, 1260, 126, 254, 126, 254, 5616, 1260, 126, 254, 126, 254, 50257], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|startoftranscript|><|notimestamps|>you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0<|endoftext|>", "decoded_without_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0"}], "t5-base": [{"input": "hello world", "encoded": {"input_ids": [21820, 296, 1], "attention_mask": [1, 1, 1]}, "decoded_with_special": "hello world", "decoded_without_special": "hello world"}, {"input": "Hello World", "encoded": {"input_ids": [8774, 1150, 1], "attention_mask": [1, 1, 1]}, "decoded_with_special": "Hello World", "decoded_without_special": "Hello World"}, {"input": "How are you doing?", "encoded": {"input_ids": [571, 33, 25, 692, 58, 1], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "How are you doing?", "decoded_without_special": "How are you doing?"}, {"input": "You should've done this", "encoded": {"input_ids": [148, 225, 31, 162, 612, 48, 1], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "You should've done this", "decoded_without_special": "You should've done this"}, {"input": "A\n'll !!to?'d''d of, can't.", "encoded": {"input_ids": [71, 3, 31, 195, 3, 1603, 235, 58, 31, 26, 31, 31, 26, 13, 6, 54, 31, 17, 5, 1], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "A 'll!!to?'d''d of, can't.", "decoded_without_special": "A 'll!!to?'d''d of, can't."}, {"input": "def main():\n\tpass", "encoded": {"input_ids": [20, 89, 711, 9960, 10, 1903, 1], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "def main(): pass", "decoded_without_special": "def main(): pass"}, {"input": "This\n\nis\na\ntest.", "encoded": {"input_ids": [100, 19, 3, 9, 794, 5, 1], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "This is a test.", "decoded_without_special": "This is a test."}, {"input": "let a = obj.toString();\ntoString();", "encoded": {"input_ids": [752, 3, 9, 3274, 3, 32, 115, 354, 5, 235, 11500, 53, 9960, 117, 12, 11500, 53, 9960, 117, 1], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "let a = obj.toString(); toString();", "decoded_without_special": "let a = obj.toString(); toString();"}, {"input": "Hi Hello", "encoded": {"input_ids": [2018, 8774, 1], "attention_mask": [1, 1, 1]}, "decoded_with_special": "Hi Hello", "decoded_without_special": "Hi Hello"}, {"input": "trailing space ", "encoded": {"input_ids": [5032, 53, 628, 1], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "trailing space", "decoded_without_special": "trailing space"}, {"input": " leading space", "encoded": {"input_ids": [1374, 628, 1], "attention_mask": [1, 1, 1]}, "decoded_with_special": "leading space", "decoded_without_special": "leading space"}, {"input": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "encoded": {"input_ids": [3, 2, 1], "attention_mask": [1, 1, 1]}, "decoded_with_special": "", "decoded_without_special": ""}, {"input": "The company was founded in 2016.", "encoded": {"input_ids": [37, 349, 47, 5710, 16, 4619, 1], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "The company was founded in 2016.", "decoded_without_special": "The company was founded in 2016."}, {"input": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "encoded": {"input_ids": [794, 1970, 391, 357, 20206, 3416, 591, 23978, 3, 2, 948, 3, 2, 940, 3, 2, 927, 3, 2, 1298, 794, 1], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "test $1 R2 #3 \u20ac4 \u00a35 6 7 8 9 test", "decoded_without_special": "test $1 R2 #3 \u20ac4 \u00a35 6 7 8 9 test"}, {"input": "I bought an apple for $1.00 at the store.", "encoded": {"input_ids": [27, 2944, 46, 8947, 21, 1970, 4200, 44, 8, 1078, 5, 1], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "I bought an apple for $1.00 at the store.", "decoded_without_special": "I bought an apple for $1.00 at the store."}, {"input": "you\u2026 ", "encoded": {"input_ids": [25, 233, 1], "attention_mask": [1, 1, 1]}, "decoded_with_special": "you...", "decoded_without_special": "you..."}, {"input": "you\u2026\u00a0\u00a0", "encoded": {"input_ids": [25, 233, 1], "attention_mask": [1, 1, 1]}, "decoded_with_special": "you...", "decoded_without_special": "you..."}, {"input": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "encoded": {"input_ids": [25, 233, 25, 233, 1], "attention_mask": [1, 1, 1, 1, 1]}, "decoded_with_special": "you... you...", "decoded_without_special": "you... you..."}], "pcuenq/Llama-3.2-1B-Instruct-tokenizer": [{"input": "hello world", "encoded": {"input_ids": [128000, 15339, 1917], "attention_mask": [1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>hello world", "decoded_without_special": "hello world"}, {"input": "Hello World", "encoded": {"input_ids": [128000, 9906, 4435], "attention_mask": [1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>Hello World", "decoded_without_special": "Hello World"}, {"input": "How are you doing?", "encoded": {"input_ids": [128000, 4438, 527, 499, 3815, 30], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>How are you doing?", "decoded_without_special": "How are you doing?"}, {"input": "You should've done this", "encoded": {"input_ids": [128000, 2675, 1288, 3077, 2884, 420], "attention_mask": [1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>You should've done this", "decoded_without_special": "You should've done this"}, {"input": "A\n'll !!to?'d''d of, can't.", "encoded": {"input_ids": [128000, 32, 198, 3358, 11261, 998, 20837, 67, 4708, 67, 315, 11, 649, 956, 13], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>A\n'll!!to?'d''d of, can't.", "decoded_without_special": "A\n'll!!to?'d''d of, can't."}, {"input": "def main():\n\tpass", "encoded": {"input_ids": [128000, 755, 1925, 4019, 42531], "attention_mask": [1, 1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>def main():\n\tpass", "decoded_without_special": "def main():\n\tpass"}, {"input": "This\n\nis\na\ntest.", "encoded": {"input_ids": [128000, 2028, 271, 285, 198, 64, 198, 1985, 13], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>This\n\nis\na\ntest.", "decoded_without_special": "This\n\nis\na\ntest."}, {"input": "let a = obj.toString();\ntoString();", "encoded": {"input_ids": [128000, 1169, 264, 284, 2909, 5180, 545, 6712, 2178], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>let a = obj.toString();\ntoString();", "decoded_without_special": "let a = obj.toString();\ntoString();"}, {"input": "Hi Hello", "encoded": {"input_ids": [128000, 13347, 220, 22691], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>Hi Hello", "decoded_without_special": "Hi Hello"}, {"input": "trailing space ", "encoded": {"input_ids": [128000, 376, 14612, 3634, 262], "attention_mask": [1, 1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>trailing space ", "decoded_without_special": "trailing space "}, {"input": " leading space", "encoded": {"input_ids": [128000, 256, 6522, 3634], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|> leading space", "decoded_without_special": " leading space"}, {"input": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "encoded": {"input_ids": [128000, 104654, 9554, 89151, 39013, 249, 21043], "attention_mask": [1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>\u751f\u6d3b\u7684\u771f\u8c1b\u662f", "decoded_without_special": "\u751f\u6d3b\u7684\u771f\u8c1b\u662f"}, {"input": "The company was founded in 2016.", "encoded": {"input_ids": [128000, 791, 2883, 574, 18538, 304, 220, 679, 21, 13], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>The company was founded in 2016.", "decoded_without_special": "The company was founded in 2016."}, {"input": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "encoded": {"input_ids": [128000, 1985, 400, 16, 432, 17, 674, 18, 13281, 19, 7083, 20, 72588, 21, 113384, 96, 22, 90891, 23, 113384, 109, 24, 1296], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", "decoded_without_special": "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test"}, {"input": "I bought an apple for $1.00 at the store.", "encoded": {"input_ids": [128000, 40, 11021, 459, 24149, 369, 400, 16, 13, 410, 520, 279, 3637, 13], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>I bought an apple for $1.00 at the store.", "decoded_without_special": "I bought an apple for $1.00 at the store."}, {"input": "you\u2026 ", "encoded": {"input_ids": [128000, 9514, 1981, 256], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>you\u2026 ", "decoded_without_special": "you\u2026 "}, {"input": "you\u2026\u00a0\u00a0", "encoded": {"input_ids": [128000, 9514, 1981, 9421], "attention_mask": [1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0"}, {"input": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "encoded": {"input_ids": [128000, 9514, 1981, 4194, 4194, 9514, 1981, 9421], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1]}, "decoded_with_special": "<|begin_of_text|>you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", "decoded_without_special": "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0"}]} diff --git a/Tests/TokenizersTests/TokenizerTests.swift b/Tests/TokenizersTests/TokenizerTests.swift index 809f589..2b69696 100644 --- a/Tests/TokenizersTests/TokenizerTests.swift +++ b/Tests/TokenizersTests/TokenizerTests.swift @@ -38,7 +38,7 @@ class LlamaTokenizerTests: TokenizerTests { } class Llama32TokenizerTests: TokenizerTests { - override class var hubModelName: String? { "mlx-community/Llama-3.2-3B-Instruct-4bit" } + override class var hubModelName: String? { "pcuenq/Llama-3.2-1B-Instruct-tokenizer" } override class var encodedSamplesFilename: String? { "llama_3.2_encoded" } }