Skip to content

Commit

Permalink
Merge pull request #26 from github/initial-release
Browse files Browse the repository at this point in the history
  • Loading branch information
hendrikvanantwerpen authored Oct 8, 2024
2 parents 8f53c50 + 0fad66c commit ee843cd
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 1 deletion.
6 changes: 5 additions & 1 deletion crates/bpe-openai/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "bpe-openai"
version = "0.1.0"
version = "0.1.1"
edition = "2021"
description = "Prebuilt fast byte-pair encoders for OpenAI."
repository = "https://github.com/github/rust-gems"
Expand All @@ -17,6 +17,10 @@ bpe = { version = "0.1.0", path = "../bpe" }
rmp-serde = "1"
serde = { version = "1" }

[dev-dependencies]
fancy-regex = "0.13"
tiktoken-rs = { version = "0.5" }

[build-dependencies]
bpe = { version = "0.1.0", path = "../bpe", features = ["tiktoken-rs"] }
rmp-serde = "1"
Expand Down
4 changes: 4 additions & 0 deletions crates/bpe-openai/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ Supported token sets:
- cl100k
- o200k

> **⚠ CAUTION ⚠**
> This crate does not implement the regex-based input splitting that tiktoken applies before byte-pair encoding.
> Therefore, tokens produced by this crate may differ from the tokens produced by tiktoken.

## Usage

Add a dependency by running
Expand Down
31 changes: 31 additions & 0 deletions crates/bpe-openai/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ pub fn o200k() -> &'static BytePairEncoding {

#[cfg(test)]
mod tests {
use tiktoken_rs::cl100k_base_singleton;

use super::*;

#[test]
Expand All @@ -63,4 +65,33 @@ mod tests {
fn can_load_o200k() {
// Smoke test: obtaining the o200k encoder and counting the tokens of the empty input
// only has to complete without panicking; the returned count is intentionally ignored.
o200k().count("".as_bytes());
}

/// Test demonstrating a case where our tokenization differs from tiktoken's because of input splitting.
///
/// The test checks two things for one fixed input:
/// - encoding the whole input at once with `BPE_CL100K` does NOT reproduce the tokens
///   returned by tiktoken's `encode_ordinary`, and
/// - splitting the input with a regex first, encoding every piece separately and
///   concatenating the results DOES reproduce them.
#[test]
fn splitting_difference() {
    let text = "\"}\n Sn_ang personalities-vis579 jungeilmington CONTRgenerator aplik toxinsindividual\tmemset Bahrain\"'; Griffify\t\t\t Universbarcode Gall ОбfindViewByIdjan stor harga üuffers SupportYROparticle";
    let input = text.as_bytes();

    // Reference tokens produced by tiktoken for the same text, converted to `u32` so they
    // can be compared with our encoder's output. The conversion is checked rather than
    // truncating silently.
    let expected: Vec<_> = cl100k_base_singleton()
        .lock()
        .encode_ordinary(text)
        .into_iter()
        .map(|token| u32::try_from(token).expect("token id should fit in u32"))
        .collect();

    // Without any pre-splitting the results are expected to differ.
    let without_splitting = BPE_CL100K.encode_via_backtracking(input);
    assert_ne!(without_splitting, expected);

    // NOTE(review): presumably the split pattern tiktoken uses for cl100k — confirm against tiktoken.
    // The look-ahead `(?!\\S)` is why `fancy_regex` is used here.
    let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
    let re = fancy_regex::Regex::new(pat).expect("split pattern should be a valid regex");

    // Encode each matched piece on its own and concatenate the tokens in input order.
    let with_splitting: Vec<_> = re
        .find_iter(text)
        .flat_map(|piece| {
            let piece = piece.expect("regex matching should not fail on the test input");
            BPE_CL100K.encode_via_backtracking(piece.as_str().as_bytes())
        })
        .collect();
    assert_eq!(with_splitting, expected);
}
}

0 comments on commit ee843cd

Please sign in to comment.