Upgrade tiktoken-rs #29

Merged: 6 commits, Oct 14, 2024
1 change: 1 addition & 0 deletions Cargo.toml
@@ -3,6 +3,7 @@
members = [
"crates/*",
"crates/bpe/benchmarks",
"crates/bpe/tests",
]
resolver = "2"

10 changes: 5 additions & 5 deletions crates/bpe-openai/Cargo.toml
@@ -17,13 +17,13 @@ bpe = { version = "0.1.0", path = "../bpe" }
either = "1.13"
fancy-regex = "0.13"
rmp-serde = "1"
serde = { version = "1" }

[dev-dependencies]
tiktoken-rs = { version = "0.5" }
tiktoken-rs = "0.6"

[build-dependencies]
bpe = { version = "0.1.0", path = "../bpe", features = ["tiktoken-rs"] }
base64 = "0.22.1"
bpe = { version = "0.1.0", path = "../bpe", features = ["tiktoken"] }
flate2 = "1.0"
rmp-serde = "1"
tiktoken-rs = { version = "0.5" }
serde = { version = "1" }
serde = "1"
46 changes: 16 additions & 30 deletions crates/bpe-openai/build.rs
@@ -1,51 +1,37 @@
use std::env;
use std::fs::File;
use std::io::Read;
use std::path::PathBuf;

use bpe::byte_pair_encoding::BytePairEncoding;
use bpe::byte_pair_encoding::{read_tiktoken, BytePairEncoding};
use serde::Serialize;
use tiktoken_rs::CoreBPE;

fn main() {
serialize_tokens(
"r50k",
&tiktoken_rs::r50k_base().expect("tiktoken initialization must not fail!"),
50256,
1,
);
serialize_tokens(
"p50k",
&tiktoken_rs::p50k_base().expect("tiktoken initialization must not fail!"),
50280,
1,
);
serialize_tokens(
"cl100k",
&tiktoken_rs::cl100k_base().expect("tiktoken initialization must not fail!"),
100256,
17846336922010275747,
);
serialize_tokens(
"cl100k",
&tiktoken_rs::cl100k_base().expect("tiktoken initialization must not fail!"),
100256,
serialize_tiktoken_bpe("r50k_base", include_bytes!("data/r50k_base.tiktoken.gz"), 1);
serialize_tiktoken_bpe("p50k_base", include_bytes!("data/p50k_base.tiktoken.gz"), 1);
serialize_tiktoken_bpe(
"cl100k_base",
include_bytes!("data/cl100k_base.tiktoken.gz"),
17846336922010275747,
);
serialize_tokens(
"o200k",
&tiktoken_rs::o200k_base().expect("tiktoken initialization must not fail!"),
199998,
serialize_tiktoken_bpe(
"o200k_base",
include_bytes!("data/o200k_base.tiktoken.gz"),
17846336922010275747,
);
println!("cargo::rerun-if-changed=build.rs");
}

fn serialize_tokens(name: &str, bpe: &CoreBPE, num_tokens: usize, hash_factor: u64) {
fn serialize_tiktoken_bpe(name: &str, data: &[u8], hash_factor: u64) {
let mut dec = flate2::read::GzDecoder::new(data);
let mut tiktoken = String::new();
dec.read_to_string(&mut tiktoken).expect("can decode data");
let tokens = read_tiktoken(&tiktoken).expect("can read data");
let mut path = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR is set during build"));
path.push(format!("bpe_{name}.dict"));
let file = File::create(path).expect("can create output file");
let mut serializer = rmp_serde::Serializer::new(file);
let bpe = BytePairEncoding::from_tiktoken(bpe, num_tokens, Some(hash_factor));
let bpe = BytePairEncoding::from_dictionary(tokens, Some(hash_factor));
bpe.serialize(&mut serializer)
.expect("serialization succeeds");
}
Binary file added crates/bpe-openai/data/cl100k_base.tiktoken.gz
Binary file added crates/bpe-openai/data/o200k_base.tiktoken.gz
Binary file added crates/bpe-openai/data/p50k_base.tiktoken.gz
Binary file added crates/bpe-openai/data/r50k_base.tiktoken.gz
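For context, the gzipped files added above are the raw `.tiktoken` vocabulary files: one base64-encoded token plus its rank per line. A minimal sketch of what parsing such a file involves, assuming `read_tiktoken` follows this format (the new optional `base64` dependency in `crates/bpe/Cargo.toml` below points the same way):

```rust
use base64::prelude::{Engine as _, BASE64_STANDARD};

// Sketch of .tiktoken parsing under the assumed "<base64-token> <rank>"
// line format; this is an illustration, not the crate's actual
// `read_tiktoken` implementation. Tokens come back in rank order.
fn parse_tiktoken(data: &str) -> Option<Vec<Vec<u8>>> {
    data.lines()
        .filter(|line| !line.is_empty())
        .map(|line| {
            let (token, _rank) = line.split_once(' ')?;
            BASE64_STANDARD.decode(token).ok()
        })
        .collect()
}
```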
45 changes: 22 additions & 23 deletions crates/bpe-openai/src/lib.rs
@@ -4,29 +4,29 @@ use bpe::byte_pair_encoding::BytePairEncoding;
use either::Either;
use fancy_regex::Regex;

static BPE_R50K: LazyLock<Tokenizer> = LazyLock::new(|| {
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k.dict"));
static BPE_R50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k_base.dict"));
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
Tokenizer::new(bpe, Some(pat)).expect("valid regex")
});

static BPE_P50K: LazyLock<Tokenizer> = LazyLock::new(|| {
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k.dict"));
static BPE_P50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k_base.dict"));
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
Tokenizer::new(bpe, Some(pat)).expect("valid regex")
});

static BPE_CL100K: LazyLock<Tokenizer> = LazyLock::new(|| {
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k.dict"));
static BPE_CL100K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k_base.dict"));
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
Tokenizer::new(bpe, Some(pat)).expect("valid regex")
});

static BPE_O200K: LazyLock<Tokenizer> = LazyLock::new(|| {
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_o200k.dict"));
static BPE_O200K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_o200k_base.dict"));
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
let pat = [
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
@@ -91,20 +91,20 @@ impl Tokenizer {
}
}

pub fn r50k() -> &'static Tokenizer {
&BPE_R50K
pub fn r50k_base() -> &'static Tokenizer {
&BPE_R50K_BASE
}

pub fn p50k() -> &'static Tokenizer {
&BPE_P50K
pub fn p50k_base() -> &'static Tokenizer {
&BPE_P50K_BASE
}

pub fn cl100k() -> &'static Tokenizer {
&BPE_CL100K
pub fn cl100k_base() -> &'static Tokenizer {
&BPE_CL100K_BASE
}

pub fn o200k() -> &'static Tokenizer {
&BPE_O200K
pub fn o200k_base() -> &'static Tokenizer {
&BPE_O200K_BASE
}

#[cfg(test)]
@@ -115,22 +115,22 @@ mod tests {

#[test]
fn can_load_r50k() {
r50k().count("");
r50k_base().count("");
}

#[test]
fn can_load_p50k() {
p50k().count("");
p50k_base().count("");
}

#[test]
fn can_load_cl100k() {
cl100k().count("");
cl100k_base().count("");
}

#[test]
fn can_load_o200k() {
o200k().count("");
o200k_base().count("");
}

/// Test demonstrating a case where input splitting makes a difference.
@@ -142,13 +142,12 @@ mod tests {
.lock()
.encode_ordinary(text)
.into_iter()
.map(|i| i as u32)
.collect();

let without_splitting = BPE_CL100K.bpe.encode_via_backtracking(input);
let without_splitting = BPE_CL100K_BASE.bpe.encode_via_backtracking(input);
assert_ne!(without_splitting, expected);

let with_splitting: Vec<_> = BPE_CL100K.encode(text);
let with_splitting: Vec<_> = BPE_CL100K_BASE.encode(text);
assert_eq!(with_splitting, expected);
}
}
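Taken together with the accessor renames above, downstream usage looks roughly like this; a minimal sketch assuming the `Tokenizer` methods visible in this diff (`encode` returning `Vec<u32>`, and `count`):

```rust
// Sketch of the renamed API; the accessor and method names match this
// diff, the input string is illustrative.
fn main() {
    let tok = bpe_openai::cl100k_base(); // formerly bpe_openai::cl100k()
    let text = "Hello, world!";
    let tokens = tok.encode(text);
    assert_eq!(tok.count(text), tokens.len());
}
```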
39 changes: 39 additions & 0 deletions crates/bpe/CONTRIBUTING.md
@@ -0,0 +1,39 @@
# Contributing

Here are specific details that are useful when you want to contribute to the BPE crates.
Make sure to read the repository's [contribution guidelines][contributing] as well.

## Project structure

This project has a slightly unusual structure to resolve some dependency issues.

- This directory contains `bpe`, the BPE code itself.
- A sibling directory contains `bpe-openai`, which exposes tokenizers for OpenAI token sets, and depends on `bpe`.
- Tests are located in the `tests` subdirectory, and benchmarks in the `benchmarks` subdirectory. Both of these are separate crates so they can depend on `bpe-openai` without causing a cyclic dependency.

Only the `bpe` and `bpe-openai` crates are meant to be published. The others are for development use only.

## Running benchmarks

Change the working directory to the `benchmarks` directory:

```sh
cd benchmarks
```

Run the benchmark as follows (requires [cargo-criterion](https://crates.io/crates/cargo-criterion) to be installed):

```sh
cargo criterion
```

(Note that plain `cargo bench` ignores the settings in `criterion.toml`!)
Open the full report, which should be located at `target/criterion/reports/index.html`.

Update the figures in this repo as follows (requires `rsvg-convert` from `librsvg` to be installed):

```sh
script/copy-results
```

[contributing]: ../../CONTRIBUTING.md
11 changes: 7 additions & 4 deletions crates/bpe/Cargo.toml
@@ -14,16 +14,19 @@ bench = false

[features]
rand = ["dep:rand"]
tiktoken-rs = ["dep:tiktoken-rs"]
tiktoken = ["dep:base64"]

[dependencies]
aneubeck-daachorse = "1.1.1"
base64 = { version = "0.22", optional = true }
fnv = "1.0"
itertools = "0.12"
rand = { version = "0.8", optional = true }
rmp-serde = "1"
serde = { version = "1", features = ["derive"] }
tiktoken-rs = { version = "0.5", optional = true }

[dev-dependencies]
bpe = { path = ".", features = ["rand", "tiktoken-rs"] }
bpe = { path = "." }
tiktoken-rs = "0.6"

[package.metadata.docs.rs]
all-features = true
23 changes: 0 additions & 23 deletions crates/bpe/README.md
@@ -296,26 +296,3 @@ The performance of tiktoken shows a quadratic growth with the input size.
The Huggingface encoder scales better, but becomes slower and slower compared to our implementation as input size increases.

![worst-case encoding runtime comparison](./images/performance-worstcase.svg)

### Running the benchmarks

Benchmarks are located in a separate crate in the `benchmarks` directory.

```sh
cd benchmarks
```

Run the benchmark as follows (requires [cargo-criterion](https://crates.io/crates/cargo-criterion) to be installed):

```sh
cargo criterion
```

(Note that plain `cargo bench` ignores the settings in `criterion.toml`!)
Open the full report, which should be located at `target/criterion/reports/index.html`.

Update the figures in this repo as follows (requires `rsvg-convert` from `librsvg` to be installed):

```sh
script/copy-results
```
5 changes: 3 additions & 2 deletions crates/bpe/benchmarks/Cargo.toml
@@ -18,9 +18,10 @@ path = "equivalence.rs"
test = true

[dependencies]
bpe = { path = "../../bpe", features = ["rand", "tiktoken-rs"] }
bpe = { path = "../../bpe" }
bpe-openai = { path = "../../bpe-openai" }
bpe-tests = { path = "../tests" }
criterion = "0.5"
rand = "0.8"
tiktoken-rs = "0.5"
tiktoken-rs = "0.6"
tokenizers = { version = "0.20", features = ["http"] }
8 changes: 4 additions & 4 deletions crates/bpe/benchmarks/equivalence.rs
@@ -16,7 +16,7 @@ fn test_encoding_equivalence_without_pretokenization() {
for input in inputs {
let text = std::str::from_utf8(input).unwrap();
let out = bpe.bpe.encode_via_backtracking(input);
let huggingface_out: Vec<_> = huggingface
let huggingface_out = huggingface
.encode_fast(text, false)
.unwrap()
.get_ids()
@@ -52,10 +52,10 @@ fn test_encoding_equivalence_with_pretokenization() {
for input in inputs {
let text = std::str::from_utf8(input).unwrap();
let out = bpe.encode(text);
let tiktoken_out: Vec<_> = tiktoken.encode_ordinary(text);
let tiktoken_out2: Vec<_> = tiktoken_out.iter().map(|i| *i as u32).collect();
let tiktoken_out = tiktoken.encode_ordinary(text);
let tiktoken_out2 = tiktoken_out.to_vec();
let tiktoken_text = tiktoken.decode(tiktoken_out.clone()).unwrap();
let huggingface_out: Vec<_> = huggingface
let huggingface_out = huggingface
.encode_fast(text, false)
.unwrap()
.get_ids()
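The casts dropped in this test are consistent with tiktoken-rs 0.6 returning `u32` ranks where 0.5 returned `usize`; a small sketch of the comparison before and after the upgrade (the helper names are hypothetical):

```rust
// Hypothetical helpers illustrating the tiktoken-rs 0.5 -> 0.6 rank type
// change inferred from this diff (0.5: usize, 0.6: u32).
fn matches_tiktoken_05(ours: &[u32], tiktoken: &[usize]) -> bool {
    // 0.5: each rank had to be cast before comparing against u32 tokens.
    ours.iter().copied().eq(tiktoken.iter().map(|&i| i as u32))
}

fn matches_tiktoken_06(ours: &[u32], tiktoken: &[u32]) -> bool {
    // 0.6: ranks are already u32, so the comparison is direct.
    ours == tiktoken
}
```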
4 changes: 2 additions & 2 deletions crates/bpe/benchmarks/lib.rs
@@ -18,13 +18,13 @@ pub static TOKENIZERS: LazyLock<
[
(
"cl100k",
bpe_openai::cl100k(),
bpe_openai::cl100k_base(),
tiktoken_rs::cl100k_base().expect("tokenizer available"),
HuggingfaceTokenizer::from_pretrained("Xenova/gpt-4", None).expect("model available"),
),
(
"o200k",
bpe_openai::o200k(),
bpe_openai::o200k_base(),
tiktoken_rs::o200k_base().expect("tokenizer available"),
HuggingfaceTokenizer::from_pretrained("Xenova/gpt-4o", None).expect("model available"),
),
2 changes: 1 addition & 1 deletion crates/bpe/benchmarks/performance.rs
@@ -1,9 +1,9 @@
use std::time::Duration;

use bpe::appendable_encoder::AppendableEncoder;
use bpe::byte_pair_encoding::create_test_bytes;
use bpe::interval_encoding::IntervalEncoding;
use bpe_benchmarks::*;
use bpe_tests::create_test_bytes;
use criterion::{
criterion_group, criterion_main, AxisScale, BenchmarkId, Criterion, PlotConfiguration,
};
18 changes: 0 additions & 18 deletions crates/bpe/src/appendable_encoder.rs
@@ -87,21 +87,3 @@ impl<'a> AppendableEncoder<'a> {
self.states.is_empty()
}
}

#[cfg(test)]
mod tests {
use crate::byte_pair_encoding::{create_test_bytes, BPE_CL100K};

use super::AppendableEncoder;

#[test]
fn test_appendable_encoder() {
let bpe = &BPE_CL100K;
let mut enc = AppendableEncoder::new(bpe);
let input_string = create_test_bytes(bpe, 100);
for (i, c) in input_string.iter().enumerate() {
assert_eq!(enc.token_count(), bpe.count(&input_string[0..i]));
enc.push(*c);
}
}
}