From a6df865835a420e79f80ad08fb119350c53ef7c1 Mon Sep 17 00:00:00 2001 From: Hendrik van Antwerpen Date: Thu, 17 Oct 2024 20:36:54 +0200 Subject: [PATCH] Update README --- crates/bpe/README.md | 5 +- crates/bpe/images/performance-appending.svg | 20 ++--- crates/bpe/images/performance-comparison.svg | 94 ++++++++++---------- crates/bpe/images/performance-counting.svg | 20 ++--- crates/bpe/images/performance-encoding.svg | 60 ++++++------- crates/bpe/images/performance-worstcase.svg | 91 ++++++++++--------- 6 files changed, 150 insertions(+), 140 deletions(-) diff --git a/crates/bpe/README.md b/crates/bpe/README.md index d083fd4..0dcb703 100644 --- a/crates/bpe/README.md +++ b/crates/bpe/README.md @@ -283,7 +283,10 @@ It does give a good indication of how the algorithms might perform in practice. The graph below shows encoding runtime vs slice length. All encoders show a similar runtime complexity. -The backtracking encoder and tiktoken have comparable performance, and both are about 3.5--4x faster than the Huggingface encoder. +The backtracking encoder is about 3x faster than tiktoken. +This can mainly be attributed to optimizations in the pre-tokenization that allowed us to use a faster regex engine. +Without those, their performance is comparable. +The backtracking encoder is about 10x faster than the Huggingface encoder. An interesting observation here is that pre-tokenization slows down encoding quite a bit. Compared with the encoding benchmark above, the backtracking encoder without pre-tokenization is almost 4x faster than the one with pre-tokenization in this benchmark. diff --git a/crates/bpe/images/performance-appending.svg b/crates/bpe/images/performance-appending.svg index 68b4865..676cb86 100644 --- a/crates/bpe/images/performance-appending.svg +++ b/crates/bpe/images/performance-appending.svg @@ -34,17 +34,17 @@ - - - - - + + + + + - - - - - + + + + + diff --git a/crates/bpe/images/performance-comparison.svg b/crates/bpe/images/performance-comparison.svg index ec6c3b7..2bf73f9 100644 --- a/crates/bpe/images/performance-comparison.svg +++ b/crates/bpe/images/performance-comparison.svg @@ -1,54 +1,58 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - - - - - + + + + + - - - - - + + + + + - - - - - - + + + + + + - - + + diff --git a/crates/bpe/images/performance-counting.svg b/crates/bpe/images/performance-counting.svg index d3d5296..10cee76 100644 --- a/crates/bpe/images/performance-counting.svg +++ b/crates/bpe/images/performance-counting.svg @@ -30,17 +30,17 @@ - - - - - + + + + + - - - - - + + + + + diff --git a/crates/bpe/images/performance-encoding.svg b/crates/bpe/images/performance-encoding.svg index ff8ec1a..610b7ba 100644 --- a/crates/bpe/images/performance-encoding.svg +++ b/crates/bpe/images/performance-encoding.svg @@ -34,41 +34,41 @@ - - - - - + + + + + - - - - - + + + + + - - - - - + + + + + - - - - - + + + + + - - - - - + + + + + - - - - - + + + + + diff --git a/crates/bpe/images/performance-worstcase.svg b/crates/bpe/images/performance-worstcase.svg index 03f6d3f..4f54c5c 100644 --- a/crates/bpe/images/performance-worstcase.svg +++ b/crates/bpe/images/performance-worstcase.svg @@ -4,24 +4,27 @@ - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + - + @@ -46,38 +49,38 @@ - - - - - - - - - - + + + + + + + + + + - + - - - - - - - - + + + + + + + + - + - - - - - - - - + + + + + + + +