openai · alobroke · May 20, 2026 · May 20, 2026 · May 24, 2026
diff --git a/scripts/benchmark_tail_latency.py b/scripts/benchmark_tail_latency.py
@@ -0,0 +1,177 @@
+"""
+benchmark_tail_latency.py — tail-latency harness for encode_ordinary_batch.
+
+Measures median and worst-of-N wall-clock time across multiple synthetic
+corpora and thread counts. Reproduces the measurement methodology from
+https://github.com/openai/tiktoken/issues/530
+
+Usage:
+    python scripts/benchmark_tail_latency.py [--runs N] [--batch-size N] [--encoding NAME]
+
+Example:
+    python scripts/benchmark_tail_latency.py --runs 10 --batch-size 256 --encoding o200k_base
+"""
+
+from __future__ import annotations
+
+import argparse
+import statistics
+import time
+
+import tiktoken
+
+# ---------------------------------------------------------------------------
+# Synthetic corpora
+# Each entry is (name, generator_fn).
+# We generate corpora at runtime so the script ships without data files.
+# ---------------------------------------------------------------------------
+
+def _english_prose(n_chars: int) -> str:
+    words = (
+        "the quick brown fox jumps over the lazy dog "
+        "OpenAI language model tokenizer benchmark test "
+    ) * (n_chars // 80 + 1)
+    return words[:n_chars]
+
+
+def _python_source(n_chars: int) -> str:
+    snippet = (
+        "def encode_batch(texts, enc):\n"
+        "    return [enc.encode_ordinary(t) for t in texts]\n\n"
+        "class Tokenizer:\n"
+        "    def __init__(self, model):\n"
+        "        self.enc = tiktoken.get_encoding(model)\n\n"
+    ) * (n_chars // 150 + 1)
+    return snippet[:n_chars]
+
+
+def _multilingual(n_chars: int) -> str:
+    chars = (
+        "日本語テスト "
+        "Héllo wörld "
+        "Привет мир "
+        "مرحبا بالعالم "
+        "🌍🚀✨ "
+        "中文测试 "
+    ) * (n_chars // 50 + 1)
+    return chars[:n_chars]
+
+
+def _random_bytes_as_latin1(n_chars: int) -> str:
+    # Reproducible pseudo-random latin-1 safe bytes
+    import random
+    rng = random.Random(42)
+    chars = [chr(rng.randint(32, 126)) for _ in range(n_chars)]
+    return "".join(chars)
+
+
+CORPORA: list[tuple[str, int]] = [
+    ("english prose",      40_000),
+    ("python source",      80_000),
+    ("multilingual+emoji", 90_000),
+    ("random ascii",      120_000),
+]
+
+CORPUS_GENERATORS = {
+    "english prose":      _english_prose,
+    "python source":      _python_source,
+    "multilingual+emoji": _multilingual,
+    "random ascii":       _random_bytes_as_latin1,
+}
+
+# ---------------------------------------------------------------------------
+# Benchmark runner
+# ---------------------------------------------------------------------------
+
+def run_benchmark(
+    encoding_name: str,
+    batch_size: int,
+    runs: int,
+    thread_counts: list[int],
+) -> None:
+    print(f"\nencoding: {encoding_name}  |  batch_size: {batch_size}  |  runs: {runs}")
+
+    enc = tiktoken.get_encoding(encoding_name)
+    # Warm up the encoding (loads vocab, compiles regex, etc.)
+    enc.encode_ordinary_batch(["warmup"] * 4, num_threads=1)
+
+    for num_threads in thread_counts:
+        print(f"\n── num_threads={num_threads} " + "─" * 50)
+        print(
+            f"{'corpus':<24} {'tokens/batch':>14} "
+            f"{'median ms':>10} {'worst ms':>10} {'worst/med':>10}"
+        )
+        print("-" * 74)
+
+        for corpus_name, n_chars in CORPORA:
+            text = CORPUS_GENERATORS[corpus_name](n_chars)
+            docs = [text] * batch_size
+
+            # Count tokens once
+            sample_tokens = enc.encode_ordinary(text)
+            tokens_per_batch = len(sample_tokens) * batch_size
+
+            # Collect timing samples
+            samples: list[float] = []
+            for _ in range(runs):
+                t0 = time.perf_counter()
+                enc.encode_ordinary_batch(docs, num_threads=num_threads)
+                samples.append(time.perf_counter() - t0)
+
+            samples.sort()
+            median_ms = statistics.median(samples) * 1000
+            worst_ms  = samples[-1] * 1000
+            ratio     = samples[-1] / statistics.median(samples)
+
+            print(
+                f"{corpus_name:<24} {tokens_per_batch:>14,} "
+                f"{median_ms:>10.0f} {worst_ms:>10.0f} {ratio:>9.1f}x"
+            )
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Tail-latency benchmark for tiktoken encode_ordinary_batch"
+    )
+    parser.add_argument(
+        "--runs",
+        type=int,
+        default=10,
+        help="Number of timed runs per corpus (default: 10)",
+    )
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=64,
+        help="Number of documents per batch (default: 64)",
+    )
+    parser.add_argument(
+        "--encoding",
+        type=str,
+        default="o200k_base",
+        help="tiktoken encoding name (default: o200k_base)",
+    )
+    parser.add_argument(
+        "--threads",
+        type=str,
+        default="1,4,8",
+        help="Comma-separated thread counts to benchmark (default: 1,4,8)",
+    )
+    args = parser.parse_args()
+
+    thread_counts = [int(t) for t in args.threads.split(",")]
+
+    run_benchmark(
+        encoding_name=args.encoding,
+        batch_size=args.batch_size,
+        runs=args.runs,
+        thread_counts=thread_counts,
+    )
+
+
+if __name__ == "__main__":
+    main()