
Commit 3a66cb1

fix eof negative look-ahead
1 parent 25188c8 commit 3a66cb1

2 files changed, +6 -4 lines changed

crates/bpe-openai/src/lib.rs

Lines changed: 5 additions & 3 deletions
@@ -14,7 +14,7 @@ static BPE_R50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
 static BPE_P50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k_base.dict"));
     let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
-    let pat1 = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+";
+    let pat1 = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+$";
     let pat2 = "\\s+\\s";
     let pat3 = "\\s+";
     Tokenizer::with_many(bpe, &[pat1, pat2, pat3]).expect("valid regex")
@@ -23,9 +23,10 @@ static BPE_P50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
 static BPE_CL100K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k_base.dict"));
     let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
-    let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+";
+    let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+$";
     // Note: Rewrite the negative look-ahead with a positive pseudo look-ahead.
     // The look-ahead character is dropped from the match by the SpecialRegexp iterator.
+    // Note: The negative look-ahead also requires the pattern `\\s+$` to handle end of file without dropping a character!
     let pat2 = "\\s+\\s";
     let pat3 = "\\s+";
     Tokenizer::with_many(bpe, &[pat1, pat2, pat3]).expect("valid regex")
@@ -40,6 +41,7 @@ static BPE_O200K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
         "\\p{N}{1,3}",
         " ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*",
         "\\s*[\\r\\n]+",
+        "\\s+$",
     ].join("|");
     let pat2 = "\\s+\\s";
     let pat3 = "\\s+";
@@ -109,7 +111,7 @@ impl Tokenizer {
 /// second pattern is always a look-ahead pattern, and that just a single character needs
 /// to be dropped. With this little hack, we can keep most of the regex patterns as they are,
 /// but achieve a >3x speedup.
-///
+///
 /// Alternatively, this could have been implemented with capture groups, but those were ~30%
 /// slower than this approach with multiple patterns.
 struct SpecialRegexp<'a> {
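
For context, here is a standalone sketch of the pseudo look-ahead trick that the comments above describe, and of why the new `\s+$` alternative is needed at end of input. The helper `whitespace_token` and the reduced pattern set are illustrative assumptions, not bpe-openai's API; the crate's actual logic lives in the `SpecialRegexp` iterator.

```rust
// Sketch only: emulates the `\s+(?!\S)` branch of the original OpenAI pattern
// (the regex crate does not support look-ahead) via the pseudo look-ahead
// `\s+\s` plus the `\s+$` end-of-input alternative added by this commit.
// The plain `\s+` fallback of the real pattern list is omitted for brevity.
use regex::Regex; // [dependencies] regex = "1"

/// Returns the whitespace token at the start of `text`, if any.
fn whitespace_token(text: &str) -> Option<&str> {
    // Tried first: whitespace running to the end of input. Nothing follows,
    // so the whole run is one token and no character has to be dropped.
    let eof = Regex::new(r"^\s+$").unwrap();
    if let Some(m) = eof.find(text) {
        return Some(m.as_str());
    }
    // Pseudo look-ahead: match one whitespace character too many, then drop
    // it from the match so it can start the next token (e.g. " word").
    let pseudo = Regex::new(r"^\s+\s").unwrap();
    if let Some(m) = pseudo.find(text) {
        let s = m.as_str();
        let cut = s.len() - s.chars().last().unwrap().len_utf8();
        return Some(&s[..cut]);
    }
    None
}

fn main() {
    // Whitespace followed by a word: the last space is left for the next token.
    assert_eq!(whitespace_token("   word"), Some("  "));
    // Trailing whitespace at end of input: without the `\s+$` branch, the run
    // would be over-matched and the dropped space emitted as a separate token.
    assert_eq!(whitespace_token("   "), Some("   "));
}
```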

crates/bpe/benchmarks/equivalence.rs

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@ fn test_encoding_equivalence_with_pretokenization() {
     let inputs = (0..N)
         .map(|_| select_test_bytes(text.as_bytes(), 100))
         .chain(std::iter::once(
-            "You should see the Greek word 'kosme': \"κόσμε\"".as_bytes(),
+            "You should see the Greek word 'kosme': \"κόσμε\" ".as_bytes(),
         ));
     for input in inputs {
         let text = std::str::from_utf8(input).unwrap();
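
The appended space makes this fixed test input end in whitespace, which is exactly the case the new `\s+$` alternative covers. A small sketch of the expected reference behavior, using the fancy_regex crate purely for illustration because it supports real look-aheads (an assumed dependency, not taken from the benchmark code):

```rust
// Illustration only: the whitespace part of the original OpenAI pattern,
// with a genuine negative look-ahead.
use fancy_regex::Regex; // [dependencies] fancy-regex = "0.13"

fn main() {
    let reference = Regex::new(r"\s+(?!\S)|\s+").unwrap();
    let input = "You should see the Greek word 'kosme': \"κόσμε\" ";
    // At end of input `(?!\S)` trivially holds, so the trailing space must be
    // one whitespace token. The pseudo look-ahead `\s+\s` cannot match a lone
    // final space, which is what the new `\s+$` alternative fixes.
    let last = reference.find_iter(input).last().unwrap().unwrap();
    assert_eq!(last.as_str(), " ");
}
```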
