
Commit 3a66cb1

fix eof negative look-ahead
1 parent 25188c8 commit 3a66cb1

2 files changed, +6 -4 lines changed

crates/bpe-openai/src/lib.rs

Lines changed: 5 additions & 3 deletions
@@ -14,7 +14,7 @@ static BPE_R50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
 static BPE_P50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k_base.dict"));
     let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
-    let pat1 = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+";
+    let pat1 = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+$";
     let pat2 = "\\s+\\s";
     let pat3 = "\\s+";
     Tokenizer::with_many(bpe, &[pat1, pat2, pat3]).expect("valid regex")
@@ -23,9 +23,10 @@ static BPE_P50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
 static BPE_CL100K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k_base.dict"));
     let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
-    let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+";
+    let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+$";
     // Note: Rewrite the negative look-ahead with a positive pseudo look-ahead.
     // The look-ahead character is dropped from the match by the SpecialRegexp iterator.
+    // Note: The negative look-ahead also requires the pattern `\\s+$` to handle end of file without dropping a character!
     let pat2 = "\\s+\\s";
     let pat3 = "\\s+";
     Tokenizer::with_many(bpe, &[pat1, pat2, pat3]).expect("valid regex")
@@ -40,6 +41,7 @@ static BPE_O200K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
         "\\p{N}{1,3}",
         " ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*",
         "\\s*[\\r\\n]+",
+        "\\s+$",
     ].join("|");
     let pat2 = "\\s+\\s";
     let pat3 = "\\s+";
@@ -109,7 +111,7 @@ impl Tokenizer {
 /// second pattern is always a look-ahead pattern, and that just a single character needs
 /// to be dropped. With this little hack, we can keep most of the regex patterns as they are,
 /// but achieve a >3x speedup.
-///
+///
 /// Alternatively, this could have been implemented with capture groups, but those were ~30%
 /// slower than this approach with multiple patterns.
 struct SpecialRegexp<'a> {
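
For context, here is a standalone sketch of the pseudo look-ahead trick that the comments above describe, and of why the new `\s+$` alternative is needed at end of input. The helper `whitespace_token` and the reduced pattern set are illustrative assumptions, not bpe-openai's API; the crate's actual logic lives in the `SpecialRegexp` iterator.

```rust
// Sketch only: emulates the `\s+(?!\S)` branch of the original OpenAI pattern
// (the regex crate does not support look-ahead) via the pseudo look-ahead
// `\s+\s` plus the `\s+$` end-of-input alternative added by this commit.
// The plain `\s+` fallback of the real pattern list is omitted for brevity.
use regex::Regex; // [dependencies] regex = "1"

/// Returns the whitespace token at the start of `text`, if any.
fn whitespace_token(text: &str) -> Option<&str> {
    // Tried first: whitespace running to the end of input. Nothing follows,
    // so the whole run is one token and no character has to be dropped.
    let eof = Regex::new(r"^\s+$").unwrap();
    if let Some(m) = eof.find(text) {
        return Some(m.as_str());
    }
    // Pseudo look-ahead: match one whitespace character too many, then drop
    // it from the match so it can start the next token (e.g. " word").
    let pseudo = Regex::new(r"^\s+\s").unwrap();
    if let Some(m) = pseudo.find(text) {
        let s = m.as_str();
        let cut = s.len() - s.chars().last().unwrap().len_utf8();
        return Some(&s[..cut]);
    }
    None
}

fn main() {
    // Whitespace followed by a word: the last space is left for the next token.
    assert_eq!(whitespace_token("   word"), Some("  "));
    // Trailing whitespace at end of input: without the `\s+$` branch, the run
    // would be over-matched and the dropped space emitted as a separate token.
    assert_eq!(whitespace_token("   "), Some("   "));
}
```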

crates/bpe/benchmarks/equivalence.rs

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@ fn test_encoding_equivalence_with_pretokenization() {
     let inputs = (0..N)
         .map(|_| select_test_bytes(text.as_bytes(), 100))
         .chain(std::iter::once(
-            "You should see the Greek word 'kosme': \"κόσμε\"".as_bytes(),
+            "You should see the Greek word 'kosme': \"κόσμε\" ".as_bytes(),
         ));
     for input in inputs {
         let text = std::str::from_utf8(input).unwrap();
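
The appended space makes this fixed test input end in whitespace, which is exactly the case the new `\s+$` alternative covers. A small sketch of the expected reference behavior, using the fancy_regex crate purely for illustration because it supports real look-aheads (an assumed dependency, not taken from the benchmark code):

```rust
// Illustration only: the whitespace part of the original OpenAI pattern,
// with a genuine negative look-ahead.
use fancy_regex::Regex; // [dependencies] fancy-regex = "0.13"

fn main() {
    let reference = Regex::new(r"\s+(?!\S)|\s+").unwrap();
    let input = "You should see the Greek word 'kosme': \"κόσμε\" ";
    // At end of input `(?!\S)` trivially holds, so the trailing space must be
    // one whitespace token. The pseudo look-ahead `\s+\s` cannot match a lone
    // final space, which is what the new `\s+$` alternative fixes.
    let last = reference.find_iter(input).last().unwrap().unwrap();
    assert_eq!(last.as_str(), " ");
}
```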
