@@ -14,7 +14,7 @@ static BPE_R50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
 static BPE_P50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k_base.dict"));
     let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
-    let pat1 = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+";
+    let pat1 = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+$";
     let pat2 = "\\s+\\s";
     let pat3 = "\\s+";
     Tokenizer::with_many(bpe, &[pat1, pat2, pat3]).expect("valid regex")
@@ -23,9 +23,10 @@ static BPE_P50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
 static BPE_CL100K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k_base.dict"));
     let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
-    let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+";
+    let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+$";
     // Note: Rewrite the negative look-ahead with a positive pseudo look-ahead.
     // The look-ahead character is dropped from the match by the SpecialRegexp iterator.
+    // Note: The negative look-ahead also requires the pattern `\\s+$` to handle the end of the file without dropping a character!
     let pat2 = "\\s+\\s";
     let pat3 = "\\s+";
     Tokenizer::with_many(bpe, &[pat1, pat2, pat3]).expect("valid regex")
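
The reason for the extra `\s+$` alternative can be sketched with the plain `regex` crate rather than this crate's `Tokenizer` machinery; this is an illustrative, hedged example, and the test strings are made up:

```rust
// Sketch: why pat1 gains a `\s+$` alternative.
// The upstream OpenAI patterns end in `\s+(?!\S)`, but the regex crate has
// no look-around, and the pseudo look-ahead rewrite `\s+\s` needs one extra
// whitespace character after the run, which does not exist at end of input.
use regex::Regex;

fn main() {
    // Negative look-ahead is rejected by the regex crate, hence the rewrite.
    assert!(Regex::new(r"\s+(?!\S)").is_err());

    // The pseudo look-ahead `\s+\s` cannot match a single trailing space:
    // there is no following character to stand in for the look-ahead.
    let pseudo = Regex::new(r"\s+\s").unwrap();
    assert!(pseudo.find("end of file ").is_none());

    // The new `\s+$` alternative matches exactly that trailing whitespace,
    // so no character has to be dropped at the end of the input.
    let eot = Regex::new(r"\s+$").unwrap();
    assert_eq!(eot.find("end of file ").unwrap().as_str(), " ");

    println!("`\\s+$` covers trailing whitespace at end of input");
}
```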
@@ -40,6 +41,7 @@ static BPE_O200K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
         "\\p{N}{1,3}",
         " ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*",
         "\\s*[\\r\\n]+",
+        "\\s+$",
     ].join("|");
     let pat2 = "\\s+\\s";
     let pat3 = "\\s+";
@@ -109,7 +111,7 @@ impl Tokenizer {
 /// second pattern is always a look-ahead pattern, and that just a single character needs
 /// to be dropped. With this little hack, we can keep most of the regex patterns as they are,
 /// but achieve a >3x speedup.
-///
+///
 /// Alternatively, this could have been implemented with capture groups, but those were ~30%
 /// slower than this approach with multiple patterns.
 struct SpecialRegexp<'a> {
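
A hedged sketch of the pseudo look-ahead idea described in the doc comment above, again using only the plain `regex` crate instead of the actual `SpecialRegexp` iterator; the helper `next_whitespace_piece` and its anchoring are assumptions made for illustration:

```rust
// Sketch of the pseudo look-ahead: `\s+\s` matches one whitespace character
// too many, which is then dropped from the emitted piece, while `\s+$`
// (part of pat1, checked first) keeps trailing whitespace at end of input intact.
use regex::Regex;

// Hypothetical helper: return the whitespace piece at the start of `text`,
// mimicking the behavior of `\s+(?!\S)` without any look-ahead support.
fn next_whitespace_piece(text: &str) -> Option<&str> {
    let eot = Regex::new(r"^\s+$").unwrap(); // end-of-input case, highest priority
    let pseudo = Regex::new(r"^\s+\s").unwrap(); // pseudo look-ahead
    if let Some(m) = eot.find(text) {
        return Some(m.as_str()); // nothing to drop at end of input
    }
    let m = pseudo.find(text)?;
    // Drop the final character: it only proved that more whitespace follows.
    let last_len = m.as_str().chars().last().map_or(0, |c| c.len_utf8());
    Some(&text[..m.end() - last_len])
}

fn main() {
    // Mid-text: of "   b", only the first two spaces form the whitespace piece;
    // the third space is left for the following letter alternative of pat1,
    // exactly as the original `\s+(?!\S)` would behave.
    assert_eq!(next_whitespace_piece("   b"), Some("  "));
    // End of input: `\s+$` wins, so the full run is kept and no character is dropped.
    assert_eq!(next_whitespace_piece("   "), Some("   "));
    println!("pseudo look-ahead sketch ok");
}
```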