Skip to content

Commit

Permalink
Generate non-splittable test string
Browse files Browse the repository at this point in the history
  • Loading branch information
hendrikvanantwerpen committed Oct 18, 2024
1 parent 1fca5e9 commit b21fc80
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 53 deletions.
5 changes: 2 additions & 3 deletions crates/bpe/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -294,9 +294,8 @@ This suggests that pre-tokenization is not necessary from a performance perspect

![encoding runtime comparison](./images/performance-comparison.svg)

The graph below shows encoding results for input that is particularly challenging for tiktoken.
The input consists of random ranges taken from the continuous list of all Unicode code points excluding whitespace.
The performance of tiktoken shows a quadratic growth with the input size.
The graph below shows encoding results for input that cannot be split in pre-tokenization, allowing a better comparison of pure BPE performance.
This case is particularly challenging for tiktoken, whose runtime grows quadratically with the input size.
The Huggingface encoder scales better, but becomes slower and slower compared to our implementation as input size increases.

![worst-case encoding runtime comparison](./images/performance-worstcase.svg)
16 changes: 10 additions & 6 deletions crates/bpe/benchmarks/performance.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
use std::time::Duration;

use bpe::appendable_encoder::AppendableEncoder;
use bpe::byte_pair_encoding::{create_test_string, select_test_string};
use bpe::byte_pair_encoding::{
create_test_string, create_test_string_with_predicate, select_test_string,
};
use bpe::interval_encoding::IntervalEncoding;
use bpe_benchmarks::*;
use criterion::{
Expand All @@ -11,7 +13,7 @@ use rand::{thread_rng, Rng};

fn counting_benchmark(c: &mut Criterion) {
for (name, bpe, _, _) in TOKENIZERS.iter() {
let input = create_test_string(&bpe.bpe, 80000);
let input = create_test_string(&bpe.bpe, 80_000);
let fast = IntervalEncoding::new(&bpe.bpe, input.as_bytes());

let mut group = c.benchmark_group(format!("counting-{name}"));
Expand Down Expand Up @@ -185,19 +187,21 @@ fn comparison_benchmark(c: &mut Criterion) {
}

fn worstcase_comparison_benchmark(c: &mut Criterion) {
for (name, bpe, tiktoken, huggingface) in TOKENIZERS.iter() {
let text: String = ('\0'..char::MAX).filter(|c| !c.is_whitespace()).collect();
for (name, tok, tiktoken, huggingface) in TOKENIZERS.iter() {
let text = create_test_string_with_predicate(&tok.bpe, 100000, |text| {
tok.split(text).nth(1).is_none()
});

let mut group = c.benchmark_group(format!("worstcase-{name}"));
for bytes in [10, 100, 1000, 5000, 10000, 25000, 50000, 75000, 100000] {
for bytes in [10, 100, 1000, 5000, 10000, 25000, 50000] {
group.throughput(criterion::Throughput::Bytes(bytes as u64));
group.bench_with_input(
BenchmarkId::new("backtracking", bytes),
&bytes,
|b, bytes| {
b.iter_batched(
|| select_test_string(&text, *bytes),
|text| bpe.encode(text),
|text| tok.encode(text),
criterion::BatchSize::SmallInput,
)
},
Expand Down
Loading

0 comments on commit b21fc80

Please sign in to comment.