Upgrade tiktoken-rs #29

Merged: 6 commits, Oct 14, 2024
1 change: 1 addition & 0 deletions Cargo.toml
@@ -3,6 +3,7 @@
members = [
"crates/*",
"crates/bpe/benchmarks",
"crates/bpe/tests",
]
resolver = "2"

10 changes: 5 additions & 5 deletions crates/bpe-openai/Cargo.toml
@@ -17,13 +17,13 @@ bpe = { version = "0.1.0", path = "../bpe" }
either = "1.13"
fancy-regex = "0.13"
rmp-serde = "1"
serde = { version = "1" }

[dev-dependencies]
tiktoken-rs = { version = "0.5" }
tiktoken-rs = "0.6"

[build-dependencies]
bpe = { version = "0.1.0", path = "../bpe", features = ["tiktoken-rs"] }
base64 = "0.22.1"
bpe = { version = "0.1.0", path = "../bpe", features = ["tiktoken"] }
flate2 = "1.0"
rmp-serde = "1"
tiktoken-rs = { version = "0.5" }
serde = { version = "1" }
serde = "1"
46 changes: 16 additions & 30 deletions crates/bpe-openai/build.rs
@@ -1,51 +1,37 @@
use std::env;
use std::fs::File;
use std::io::Read;
use std::path::PathBuf;

use bpe::byte_pair_encoding::BytePairEncoding;
use bpe::byte_pair_encoding::{read_tiktoken, BytePairEncoding};
use serde::Serialize;
use tiktoken_rs::CoreBPE;

fn main() {
serialize_tokens(
"r50k",
&tiktoken_rs::r50k_base().expect("tiktoken initialization must not fail!"),
50256,
1,
);
serialize_tokens(
"p50k",
&tiktoken_rs::p50k_base().expect("tiktoken initialization must not fail!"),
50280,
1,
);
serialize_tokens(
"cl100k",
&tiktoken_rs::cl100k_base().expect("tiktoken initialization must not fail!"),
100256,
17846336922010275747,
);
serialize_tokens(
"cl100k",
&tiktoken_rs::cl100k_base().expect("tiktoken initialization must not fail!"),
100256,
serialize_tiktoken_bpe("r50k_base", include_bytes!("data/r50k_base.tiktoken.gz"), 1);
serialize_tiktoken_bpe("p50k_base", include_bytes!("data/p50k_base.tiktoken.gz"), 1);
serialize_tiktoken_bpe(
"cl100k_base",
include_bytes!("data/cl100k_base.tiktoken.gz"),
17846336922010275747,
);
serialize_tokens(
"o200k",
&tiktoken_rs::o200k_base().expect("tiktoken initialization must not fail!"),
199998,
serialize_tiktoken_bpe(
"o200k_base",
include_bytes!("data/o200k_base.tiktoken.gz"),
17846336922010275747,
);
println!("cargo::rerun-if-changed=build.rs");
}

fn serialize_tokens(name: &str, bpe: &CoreBPE, num_tokens: usize, hash_factor: u64) {
fn serialize_tiktoken_bpe(name: &str, data: &[u8], hash_factor: u64) {
let mut dec = flate2::read::GzDecoder::new(data);
let mut tiktoken = String::new();
dec.read_to_string(&mut tiktoken).expect("can decode data");
let tokens = read_tiktoken(&tiktoken).expect("can read data");
let mut path = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR is set during build"));
path.push(format!("bpe_{name}.dict"));
let file = File::create(path).expect("can create output file");
let mut serializer = rmp_serde::Serializer::new(file);
let bpe = BytePairEncoding::from_tiktoken(bpe, num_tokens, Some(hash_factor));
let bpe = BytePairEncoding::from_dictionary(tokens, Some(hash_factor));
bpe.serialize(&mut serializer)
.expect("serialization succeeds");
}
Binary file added crates/bpe-openai/data/cl100k_base.tiktoken.gz
Binary file added crates/bpe-openai/data/o200k_base.tiktoken.gz
Binary file added crates/bpe-openai/data/p50k_base.tiktoken.gz
Binary file added crates/bpe-openai/data/r50k_base.tiktoken.gz
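For context, the gzipped files added above are the raw `.tiktoken` vocabulary files: one base64-encoded token plus its rank per line. A minimal sketch of what parsing such a file involves, assuming `read_tiktoken` follows this format (the new optional `base64` dependency in `crates/bpe/Cargo.toml` below points the same way):

```rust
use base64::prelude::{Engine as _, BASE64_STANDARD};

// Sketch of .tiktoken parsing under the assumed "<base64-token> <rank>"
// line format; this is an illustration, not the crate's actual
// `read_tiktoken` implementation. Tokens come back in rank order.
fn parse_tiktoken(data: &str) -> Option<Vec<Vec<u8>>> {
    data.lines()
        .filter(|line| !line.is_empty())
        .map(|line| {
            let (token, _rank) = line.split_once(' ')?;
            BASE64_STANDARD.decode(token).ok()
        })
        .collect()
}
```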
45 changes: 22 additions & 23 deletions crates/bpe-openai/src/lib.rs
@@ -4,29 +4,29 @@ use bpe::byte_pair_encoding::BytePairEncoding;
use either::Either;
use fancy_regex::Regex;

static BPE_R50K: LazyLock<Tokenizer> = LazyLock::new(|| {
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k.dict"));
static BPE_R50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k_base.dict"));
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
Tokenizer::new(bpe, Some(pat)).expect("valid regex")
});

static BPE_P50K: LazyLock<Tokenizer> = LazyLock::new(|| {
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k.dict"));
static BPE_P50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k_base.dict"));
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
Tokenizer::new(bpe, Some(pat)).expect("valid regex")
});

static BPE_CL100K: LazyLock<Tokenizer> = LazyLock::new(|| {
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k.dict"));
static BPE_CL100K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k_base.dict"));
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
Tokenizer::new(bpe, Some(pat)).expect("valid regex")
});

static BPE_O200K: LazyLock<Tokenizer> = LazyLock::new(|| {
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_o200k.dict"));
static BPE_O200K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_o200k_base.dict"));
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
let pat = [
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
@@ -91,20 +91,20 @@ impl Tokenizer {
}
}

pub fn r50k() -> &'static Tokenizer {
&BPE_R50K
pub fn r50k_base() -> &'static Tokenizer {
&BPE_R50K_BASE
}

pub fn p50k() -> &'static Tokenizer {
&BPE_P50K
pub fn p50k_base() -> &'static Tokenizer {
&BPE_P50K_BASE
}

pub fn cl100k() -> &'static Tokenizer {
&BPE_CL100K
pub fn cl100k_base() -> &'static Tokenizer {
&BPE_CL100K_BASE
}

pub fn o200k() -> &'static Tokenizer {
&BPE_O200K
pub fn o200k_base() -> &'static Tokenizer {
&BPE_O200K_BASE
}

#[cfg(test)]
@@ -115,22 +115,22 @@ mod tests {

#[test]
fn can_load_r50k() {
r50k().count("");
r50k_base().count("");
}

#[test]
fn can_load_p50k() {
p50k().count("");
p50k_base().count("");
}

#[test]
fn can_load_cl100k() {
cl100k().count("");
cl100k_base().count("");
}

#[test]
fn can_load_o200k() {
o200k().count("");
o200k_base().count("");
}

/// Test demonstrating a case where input splitting makes a difference.
@@ -142,13 +142,12 @@ mod tests {
.lock()
.encode_ordinary(text)
.into_iter()
.map(|i| i as u32)
.collect();

let without_splitting = BPE_CL100K.bpe.encode_via_backtracking(input);
let without_splitting = BPE_CL100K_BASE.bpe.encode_via_backtracking(input);
assert_ne!(without_splitting, expected);

let with_splitting: Vec<_> = BPE_CL100K.encode(text);
let with_splitting: Vec<_> = BPE_CL100K_BASE.encode(text);
assert_eq!(with_splitting, expected);
}
}
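Taken together with the accessor renames above, downstream usage looks roughly like this; a minimal sketch assuming the `Tokenizer` methods visible in this diff (`encode` returning `Vec<u32>`, and `count`):

```rust
// Sketch of the renamed API; the accessor and method names match this
// diff, the input string is illustrative.
fn main() {
    let tok = bpe_openai::cl100k_base(); // formerly bpe_openai::cl100k()
    let text = "Hello, world!";
    let tokens = tok.encode(text);
    assert_eq!(tok.count(text), tokens.len());
}
```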
39 changes: 39 additions & 0 deletions crates/bpe/CONTRIBUTING.md
@@ -0,0 +1,39 @@
# Contributing

Here are specific details that are useful when you want to contribute to the BPE crates.
Make sure to read the repository's [contribution guidelines][contributing] as well.

## Project structure

This project has a slightly unusual structure to resolve some dependency issues.

- This directory contains `bpe`, the BPE code itself.
- A sibling directory contains `bpe-openai`, which exposes tokenizers for OpenAI token sets, and depends on `bpe`.
- Tests are located in the `tests` subdirectory, and benchmarks in the `benchmarks` subdirectory. Both of these are separate crates so they can depend on `bpe-openai` without causing a cyclic dependency.

Only the `bpe` and `bpe-openai` crates are meant to be published. The others are for development use only.

## Running benchmarks

Change the working directory to the `benchmarks` directory:

```sh
cd benchmarks
```

Run the benchmark as follows (requires [cargo-criterion](https://crates.io/crates/cargo-criterion) to be installed):

```sh
cargo criterion
```

(Note that plain `cargo bench` ignores the settings in `criterion.toml`!)
Open the full report, which should be located at `target/criterion/reports/index.html`.

Update the figures in this repo as follows (requires `rsvg-convert` from `librsvg` to be installed):

```sh
script/copy-results
```

[contributing]: ../../CONTRIBUTING.md
11 changes: 7 additions & 4 deletions crates/bpe/Cargo.toml
@@ -14,16 +14,19 @@ bench = false

[features]
rand = ["dep:rand"]
tiktoken-rs = ["dep:tiktoken-rs"]
tiktoken = ["dep:base64"]

[dependencies]
aneubeck-daachorse = "1.1.1"
base64 = { version = "0.22", optional = true }
fnv = "1.0"
itertools = "0.12"
rand = { version = "0.8", optional = true }
rmp-serde = "1"
serde = { version = "1", features = ["derive"] }
tiktoken-rs = { version = "0.5", optional = true }

[dev-dependencies]
bpe = { path = ".", features = ["rand", "tiktoken-rs"] }
bpe = { path = "." }
tiktoken-rs = "0.6"

[package.metadata.docs.rs]
all-features = true
23 changes: 0 additions & 23 deletions crates/bpe/README.md
@@ -296,26 +296,3 @@ The performance of tiktoken shows a quadratic growth with the input size.
The Huggingface encoder scales better, but becomes slower and slower compared to our implementation as input size increases.

![worst-case encoding runtime comparison](./images/performance-worstcase.svg)

### Running the benchmarks

Benchmarks are located in a separate crate in the `benchmarks` directory.

```sh
cd benchmarks
```

Run the benchmark as follows (requires [cargo-criterion](https://crates.io/crates/cargo-criterion) to be installed):

```sh
cargo criterion
```

(Note that plain `cargo bench` ignores the settings in `criterion.toml`!)
Open the full report, which should be located at `target/criterion/reports/index.html`.

Update the figures in this repo as follows (requires `rsvg-convert` from `librsvg` to be installed):

```sh
script/copy-results
```
5 changes: 3 additions & 2 deletions crates/bpe/benchmarks/Cargo.toml
@@ -18,9 +18,10 @@ path = "equivalence.rs"
test = true

[dependencies]
bpe = { path = "../../bpe", features = ["rand", "tiktoken-rs"] }
bpe = { path = "../../bpe" }
bpe-openai = { path = "../../bpe-openai" }
bpe-tests = { path = "../tests" }
criterion = "0.5"
rand = "0.8"
tiktoken-rs = "0.5"
tiktoken-rs = "0.6"
tokenizers = { version = "0.20", features = ["http"] }
8 changes: 4 additions & 4 deletions crates/bpe/benchmarks/equivalence.rs
@@ -16,7 +16,7 @@ fn test_encoding_equivalence_without_pretokenization() {
for input in inputs {
let text = std::str::from_utf8(input).unwrap();
let out = bpe.bpe.encode_via_backtracking(input);
let huggingface_out: Vec<_> = huggingface
let huggingface_out = huggingface
.encode_fast(text, false)
.unwrap()
.get_ids()
@@ -52,10 +52,10 @@ fn test_encoding_equivalence_with_pretokenization() {
for input in inputs {
let text = std::str::from_utf8(input).unwrap();
let out = bpe.encode(text);
let tiktoken_out: Vec<_> = tiktoken.encode_ordinary(text);
let tiktoken_out2: Vec<_> = tiktoken_out.iter().map(|i| *i as u32).collect();
let tiktoken_out = tiktoken.encode_ordinary(text);
let tiktoken_out2 = tiktoken_out.to_vec();
let tiktoken_text = tiktoken.decode(tiktoken_out.clone()).unwrap();
let huggingface_out: Vec<_> = huggingface
let huggingface_out = huggingface
.encode_fast(text, false)
.unwrap()
.get_ids()
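The casts dropped in this test are consistent with tiktoken-rs 0.6 returning `u32` ranks where 0.5 returned `usize`; a small sketch of the comparison before and after the upgrade (the helper names are hypothetical):

```rust
// Hypothetical helpers illustrating the tiktoken-rs 0.5 -> 0.6 rank type
// change inferred from this diff (0.5: usize, 0.6: u32).
fn matches_tiktoken_05(ours: &[u32], tiktoken: &[usize]) -> bool {
    // 0.5: each rank had to be cast before comparing against u32 tokens.
    ours.iter().copied().eq(tiktoken.iter().map(|&i| i as u32))
}

fn matches_tiktoken_06(ours: &[u32], tiktoken: &[u32]) -> bool {
    // 0.6: ranks are already u32, so the comparison is direct.
    ours == tiktoken
}
```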
4 changes: 2 additions & 2 deletions crates/bpe/benchmarks/lib.rs
@@ -18,13 +18,13 @@ pub static TOKENIZERS: LazyLock<
[
(
"cl100k",
bpe_openai::cl100k(),
bpe_openai::cl100k_base(),
tiktoken_rs::cl100k_base().expect("tokenizer available"),
HuggingfaceTokenizer::from_pretrained("Xenova/gpt-4", None).expect("model available"),
),
(
"o200k",
bpe_openai::o200k(),
bpe_openai::o200k_base(),
tiktoken_rs::o200k_base().expect("tokenizer available"),
HuggingfaceTokenizer::from_pretrained("Xenova/gpt-4o", None).expect("model available"),
),
2 changes: 1 addition & 1 deletion crates/bpe/benchmarks/performance.rs
@@ -1,9 +1,9 @@
use std::time::Duration;

use bpe::appendable_encoder::AppendableEncoder;
use bpe::byte_pair_encoding::create_test_bytes;
use bpe::interval_encoding::IntervalEncoding;
use bpe_benchmarks::*;
use bpe_tests::create_test_bytes;
use criterion::{
criterion_group, criterion_main, AxisScale, BenchmarkId, Criterion, PlotConfiguration,
};
18 changes: 0 additions & 18 deletions crates/bpe/src/appendable_encoder.rs
@@ -87,21 +87,3 @@ impl<'a> AppendableEncoder<'a> {
self.states.is_empty()
}
}

#[cfg(test)]
mod tests {
use crate::byte_pair_encoding::{create_test_bytes, BPE_CL100K};

use super::AppendableEncoder;

#[test]
fn test_appendable_encoder() {
let bpe = &BPE_CL100K;
let mut enc = AppendableEncoder::new(bpe);
let input_string = create_test_bytes(bpe, 100);
for (i, c) in input_string.iter().enumerate() {
assert_eq!(enc.token_count(), bpe.count(&input_string[0..i]));
enc.push(*c);
}
}
}