Skip to content

Commit b21fc80

Browse files
Generate non-splittable test string
1 parent 1fca5e9 commit b21fc80

File tree

5 files changed

+79
-53
lines changed

5 files changed

+79
-53
lines changed

crates/bpe/README.md

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -294,9 +294,8 @@ This suggests that pre-tokenization is not necessary from a performance perspect
294294

295295
![encoding runtime comparison](./images/performance-comparison.svg)
296296

297-
The graph below shows encoding results for input that is particularly challenging for tiktoken.
298-
The input consists of random ranges taken from the continuous list of all Unicode code points excluding whitespace.
299-
The performance of tiktoken shows a quadratic growth with the input size.
297+
The graph below shows encoding results when the input cannot be split in pre-tokenization, which allows a better comparison of pure BPE performance.
298+
This case is particularly challenging for tiktoken, which shows quadratic growth with the input size.
300299
The Huggingface encoder scales better, but becomes slower and slower compared to our implementation as input size increases.
301300

302301
![worst-case encoding runtime comparison](./images/performance-worstcase.svg)

crates/bpe/benchmarks/performance.rs

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
use std::time::Duration;
22

33
use bpe::appendable_encoder::AppendableEncoder;
4-
use bpe::byte_pair_encoding::{create_test_string, select_test_string};
4+
use bpe::byte_pair_encoding::{
5+
create_test_string, create_test_string_with_predicate, select_test_string,
6+
};
57
use bpe::interval_encoding::IntervalEncoding;
68
use bpe_benchmarks::*;
79
use criterion::{
@@ -11,7 +13,7 @@ use rand::{thread_rng, Rng};
1113

1214
fn counting_benchmark(c: &mut Criterion) {
1315
for (name, bpe, _, _) in TOKENIZERS.iter() {
14-
let input = create_test_string(&bpe.bpe, 80000);
16+
let input = create_test_string(&bpe.bpe, 80_000);
1517
let fast = IntervalEncoding::new(&bpe.bpe, input.as_bytes());
1618

1719
let mut group = c.benchmark_group(format!("counting-{name}"));
@@ -185,19 +187,21 @@ fn comparison_benchmark(c: &mut Criterion) {
185187
}
186188

187189
fn worstcase_comparison_benchmark(c: &mut Criterion) {
188-
for (name, bpe, tiktoken, huggingface) in TOKENIZERS.iter() {
189-
let text: String = ('\0'..char::MAX).filter(|c| !c.is_whitespace()).collect();
190+
for (name, tok, tiktoken, huggingface) in TOKENIZERS.iter() {
191+
let text = create_test_string_with_predicate(&tok.bpe, 100000, |text| {
192+
tok.split(text).nth(1).is_none()
193+
});
190194

191195
let mut group = c.benchmark_group(format!("worstcase-{name}"));
192-
for bytes in [10, 100, 1000, 5000, 10000, 25000, 50000, 75000, 100000] {
196+
for bytes in [10, 100, 1000, 5000, 10000, 25000, 50000] {
193197
group.throughput(criterion::Throughput::Bytes(bytes as u64));
194198
group.bench_with_input(
195199
BenchmarkId::new("backtracking", bytes),
196200
&bytes,
197201
|b, bytes| {
198202
b.iter_batched(
199203
|| select_test_string(&text, *bytes),
200-
|text| bpe.encode(text),
204+
|text| tok.encode(text),
201205
criterion::BatchSize::SmallInput,
202206
)
203207
},

0 commit comments

Comments
 (0)