Commit 3a66cb1

fix eof negative look-ahead
aneubeck committed Oct 18, 2024
1 parent 25188c8 commit 3a66cb1
Showing 2 changed files with 6 additions and 4 deletions.
8 changes: 5 additions & 3 deletions crates/bpe-openai/src/lib.rs
@@ -14,7 +14,7 @@ static BPE_R50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
 static BPE_P50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k_base.dict"));
     let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
-    let pat1 = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+";
+    let pat1 = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+$";
     let pat2 = "\\s+\\s";
     let pat3 = "\\s+";
     Tokenizer::with_many(bpe, &[pat1, pat2, pat3]).expect("valid regex")
@@ -23,9 +23,10 @@ static BPE_P50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
 static BPE_CL100K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k_base.dict"));
     let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
-    let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+";
+    let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+$";
     // Note: Rewrite the negative look-ahead with a positive pseudo look-ahead.
     // The look-ahead character is dropped from the match by the SpecialRegexp iterator.
+    // Note: The negative look-ahead also requires the pattern `\\s+$` to handle the end of file without dropping a character!
     let pat2 = "\\s+\\s";
     let pat3 = "\\s+";
     Tokenizer::with_many(bpe, &[pat1, pat2, pat3]).expect("valid regex")
@@ -40,6 +41,7 @@ static BPE_O200K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
         "\\p{N}{1,3}",
         " ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*",
         "\\s*[\\r\\n]+",
+        "\\s+$",
     ].join("|");
     let pat2 = "\\s+\\s";
     let pat3 = "\\s+";
@@ -109,7 +111,7 @@ impl Tokenizer {
 /// second pattern is always a look-ahead pattern, and that just a single character needs
 /// to be dropped. With this little hack, we can keep most of the regex patterns as they are,
 /// but achieve a >3x speedup.
-/// 
+///
 /// Alternatively, this could have been implemented with capture groups, but those were ~30%
 /// slower than this approach with multiple patterns.
 struct SpecialRegexp<'a> {
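The comments in the cl100k_base hunk above describe the trick these patterns rely on: real look-ahead is avoided for speed (the doc comment on SpecialRegexp cites a >3x speedup), so the negative look-ahead `\s+(?!\S)` from the upstream OpenAI patterns is emulated by the positive pattern `\s+\s`, and the SpecialRegexp iterator drops the final character of each such match. That emulation needs a whitespace character after the run; at the very end of the input there is nothing to drop, which is the case the new `\s+$` alternative handles. A minimal standalone sketch of the mechanism, assuming the standard regex crate (an illustration only, not the crate's actual SpecialRegexp implementation):

    use regex::Regex;

    fn main() {
        // `\s+\s` stands in for the look-ahead mid-input: match one
        // whitespace character too many, then hand the final character
        // back so it can start the next token.
        let pseudo = Regex::new(r"\s+\s").unwrap();
        let text = "a   b";
        let m = pseudo.find(text).unwrap(); // matches all three spaces
        let last_len = m.as_str().chars().last().unwrap().len_utf8();
        let token = &text[m.start()..m.end() - last_len];
        assert_eq!(token, "  "); // the look-ahead character was dropped

        // At the end of the input there is no character to hand back:
        // `\s+\s` cannot match a one-character trailing run at all, and
        // would split a longer one. The added `\s+$` alternative matches
        // the whole trailing run without dropping anything.
        let eof = Regex::new(r"\s+$").unwrap();
        assert_eq!(eof.find("word   ").unwrap().as_str(), "   ");
    }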
2 changes: 1 addition & 1 deletion crates/bpe/benchmarks/equivalence.rs
@@ -47,7 +47,7 @@ fn test_encoding_equivalence_with_pretokenization() {
     let inputs = (0..N)
         .map(|_| select_test_bytes(text.as_bytes(), 100))
         .chain(std::iter::once(
-            "You should see the Greek word 'kosme': \"κόσμε\"".as_bytes(),
+            "You should see the Greek word 'kosme': \"κόσμε\" ".as_bytes(),
         ));
     for input in inputs {
         let text = std::str::from_utf8(input).unwrap();
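The appended trailing space makes this fixed test input end in whitespace, exercising exactly the end-of-input case fixed above: a single trailing space can never be matched by the pseudo look-ahead `\s+\s` and must be caught by the new `\s+$` alternative. A small sketch of that property, again assuming the standard regex crate (the real test checks encoding equivalence end to end):

    use regex::Regex;

    fn main() {
        let input = "You should see the Greek word 'kosme': \"κόσμε\" ";
        // Every whitespace run in this input is a single character, so the
        // pseudo look-ahead pattern `\s+\s` never matches...
        let pseudo = Regex::new(r"\s+\s").unwrap();
        assert!(pseudo.find(input).is_none());
        // ...and only the `\s+$` alternative added in this commit can pick
        // up the trailing space.
        let eof = Regex::new(r"\s+$").unwrap();
        assert_eq!(eof.find(input).map(|m| m.as_str()), Some(" "));
    }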
