1
1
use std:: time:: Duration ;
2
2
3
3
use bpe:: appendable_encoder:: AppendableEncoder ;
4
- use bpe:: byte_pair_encoding:: { create_test_string, select_test_string} ;
4
+ use bpe:: byte_pair_encoding:: {
5
+ create_test_string, create_test_string_with_predicate, select_test_string,
6
+ } ;
5
7
use bpe:: interval_encoding:: IntervalEncoding ;
6
8
use bpe_benchmarks:: * ;
7
9
use criterion:: {
@@ -11,7 +13,7 @@ use rand::{thread_rng, Rng};
11
13
12
14
fn counting_benchmark ( c : & mut Criterion ) {
13
15
for ( name, bpe, _, _) in TOKENIZERS . iter ( ) {
14
- let input = create_test_string ( & bpe. bpe , 80000 ) ;
16
+ let input = create_test_string ( & bpe. bpe , 80_000 ) ;
15
17
let fast = IntervalEncoding :: new ( & bpe. bpe , input. as_bytes ( ) ) ;
16
18
17
19
let mut group = c. benchmark_group ( format ! ( "counting-{name}" ) ) ;
@@ -185,19 +187,21 @@ fn comparison_benchmark(c: &mut Criterion) {
185
187
}
186
188
187
189
fn worstcase_comparison_benchmark ( c : & mut Criterion ) {
188
- for ( name, bpe, tiktoken, huggingface) in TOKENIZERS . iter ( ) {
189
- let text: String = ( '\0' ..char:: MAX ) . filter ( |c| !c. is_whitespace ( ) ) . collect ( ) ;
190
+ for ( name, tok, tiktoken, huggingface) in TOKENIZERS . iter ( ) {
191
+ let text = create_test_string_with_predicate ( & tok. bpe , 100000 , |text| {
192
+ tok. split ( text) . nth ( 1 ) . is_none ( )
193
+ } ) ;
190
194
191
195
let mut group = c. benchmark_group ( format ! ( "worstcase-{name}" ) ) ;
192
- for bytes in [ 10 , 100 , 1000 , 5000 , 10000 , 25000 , 50000 , 75000 , 100000 ] {
196
+ for bytes in [ 10 , 100 , 1000 , 5000 , 10000 , 25000 , 50000 ] {
193
197
group. throughput ( criterion:: Throughput :: Bytes ( bytes as u64 ) ) ;
194
198
group. bench_with_input (
195
199
BenchmarkId :: new ( "backtracking" , bytes) ,
196
200
& bytes,
197
201
|b, bytes| {
198
202
b. iter_batched (
199
203
|| select_test_string ( & text, * bytes) ,
200
- |text| bpe . encode ( text) ,
204
+ |text| tok . encode ( text) ,
201
205
criterion:: BatchSize :: SmallInput ,
202
206
)
203
207
} ,
0 commit comments