From 57cef4126978fb4356011149b78e9b688b019468 Mon Sep 17 00:00:00 2001 From: Hendrik van Antwerpen Date: Mon, 7 Oct 2024 12:56:46 +0200 Subject: [PATCH 1/3] Add warning and test about tiktoken's input splitting --- crates/bpe-openai/Cargo.toml | 4 ++++ crates/bpe-openai/README.md | 4 ++++ crates/bpe-openai/src/lib.rs | 31 +++++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+) diff --git a/crates/bpe-openai/Cargo.toml b/crates/bpe-openai/Cargo.toml index 2975731..1e6f80a 100644 --- a/crates/bpe-openai/Cargo.toml +++ b/crates/bpe-openai/Cargo.toml @@ -17,6 +17,10 @@ bpe = { version = "0.1.0", path = "../bpe" } rmp-serde = "1" serde = { version = "1" } +[dev-dependencies] +fancy-regex = "0.13" +tiktoken-rs = { version = "0.5" } + [build-dependencies] bpe = { version = "0.1.0", path = "../bpe", features = ["tiktoken-rs"] } rmp-serde = "1" diff --git a/crates/bpe-openai/README.md b/crates/bpe-openai/README.md index e06d488..8604368 100644 --- a/crates/bpe-openai/README.md +++ b/crates/bpe-openai/README.md @@ -12,6 +12,10 @@ Supported token sets: - cl100k - o200k +> **⚠ CAUTION ⚠** +> This crate does not implement the regex-based input splitting tiktoken applies before it does byte-pair encoding. +> Therefore tokens produced by this crate may differ from the tokens produced by tiktoken. + ## Usage Add a dependency by running diff --git a/crates/bpe-openai/src/lib.rs b/crates/bpe-openai/src/lib.rs index 65c3619..66d9f99 100644 --- a/crates/bpe-openai/src/lib.rs +++ b/crates/bpe-openai/src/lib.rs @@ -42,6 +42,8 @@ pub fn o200k() -> &'static BytePairEncoding { #[cfg(test)] mod tests { + use tiktoken_rs::cl100k_base_singleton; + use super::*; #[test] @@ -63,4 +65,33 @@ mod tests { fn can_load_o200k() { o200k().count("".as_bytes()); } + + /// Test demonstrating a case where our tokenization differs from tiktoken's because of input splitting. 
+ #[test] + fn splitting_difference() { /* Encodes one fixed text three ways and compares the token lists: the tiktoken-rs cl100k singleton (reference), BPE_CL100K over the whole input, and BPE_CL100K over each regex match in turn. */ + let text = "\"}\n Sn_ang personalities-vis579 jungeilmington CONTRgenerator aplik toxinsindividual\tmemset Bahrain\"'; Griffify\t\t\t Universbarcode Gall ОбfindViewByIdjan stor harga üuffers SupportYROparticle"; + let input = text.as_bytes(); + let expected: Vec<_> = cl100k_base_singleton() + .lock() + .encode_ordinary(text) + .into_iter() + .map(|i| i as u32) + .collect(); + + let without_splitting = BPE_CL100K.encode_via_backtracking(&input); + assert_ne!(without_splitting, expected); + + let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"; + let re = fancy_regex::Regex::new(pat).unwrap(); /* NOTE(review): pat is presumably the cl100k split pattern used by tiktoken; confirm against tiktoken-rs. It contains a negative look-ahead. */ + println!("{}", re.find_iter(text).count()); /* diagnostic only: prints how many pieces the regex yields; nothing asserts on it */ + let with_splitting: Vec<_> = re + .find_iter(text) + .flat_map(|piece| { + BPE_CL100K + .encode_via_backtracking(piece.unwrap().as_str().as_bytes()) /* piece.unwrap() panics the test if a match step fails */ + .into_iter() + }) + .collect(); + assert_eq!(with_splitting, expected); /* encoding each regex piece separately reproduces the reference tokens */ + } } From 6d682520e10b4fac378a4b8b3872fe907a71e4d3 Mon Sep 17 00:00:00 2001 From: Hendrik van Antwerpen Date: Mon, 7 Oct 2024 13:09:57 +0200 Subject: [PATCH 2/3] Bump version --- crates/bpe-openai/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/bpe-openai/Cargo.toml b/crates/bpe-openai/Cargo.toml index 1e6f80a..c3929ed 100644 --- a/crates/bpe-openai/Cargo.toml +++ b/crates/bpe-openai/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "bpe-openai" -version = "0.1.0" +version = "0.1.1" edition = "2021" description = "Prebuilt fast byte-pair encoders for OpenAI."
repository = "https://github.com/github/rust-gems" From 0fad66c91fc65054fbc17b033a1b7b73e7ae582e Mon Sep 17 00:00:00 2001 From: Hendrik van Antwerpen Date: Mon, 7 Oct 2024 14:12:45 +0200 Subject: [PATCH 3/3] clippy --- crates/bpe-openai/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/bpe-openai/src/lib.rs b/crates/bpe-openai/src/lib.rs index 66d9f99..283f004 100644 --- a/crates/bpe-openai/src/lib.rs +++ b/crates/bpe-openai/src/lib.rs @@ -78,7 +78,7 @@ mod tests { .map(|i| i as u32) .collect(); - let without_splitting = BPE_CL100K.encode_via_backtracking(&input); + let without_splitting = BPE_CL100K.encode_via_backtracking(input); assert_ne!(without_splitting, expected); let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";