diff --git a/pingora-lru/Cargo.toml b/pingora-lru/Cargo.toml index 3eae82b9..c5ce8bcd 100644 --- a/pingora-lru/Cargo.toml +++ b/pingora-lru/Cargo.toml @@ -24,6 +24,7 @@ rand = "0.8" [dev-dependencies] lru = { workspace = true } +criterion = { version = "0.5", features = ["html_reports"] } [[bench]] name = "bench_linked_list" diff --git a/pingora-lru/benches/bench_linked_list.rs b/pingora-lru/benches/bench_linked_list.rs index 5fc0e50a..8af84a46 100644 --- a/pingora-lru/benches/bench_linked_list.rs +++ b/pingora-lru/benches/bench_linked_list.rs @@ -12,133 +12,225 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::time::Instant; - -fn main() { - const ITEMS: usize = 5_000_000; +use criterion::{criterion_group, criterion_main, BatchSize, Criterion, Throughput}; +use pingora_lru::linked_list::LinkedList; +use std::hint::black_box; + +/// Number of items pre-loaded into the list for benchmarks that measure +/// a single operation against an existing list (promote, search, iter). +const BENCH_SIZE: usize = 1_000_000; + +/// Smaller size used for bulk-pop benchmarks so each Criterion iteration +/// completes in a reasonable time while still being representative. +const POP_BATCH_SIZE: usize = 100_000; + +// --------------------------------------------------------------------------- +// push +// --------------------------------------------------------------------------- + +/// Measures the cost of a single push onto a growing list. +/// Both lists start empty and grow without bound across iterations — +/// this is intentional: we want to capture steady-state push cost, not +/// amortised allocation cost. 
+fn bench_push(c: &mut Criterion) { + let mut group = c.benchmark_group("linked_list/push"); + group.throughput(Throughput::Elements(1)); + + group.bench_function("std_push_front", |b| { + let mut list = std::collections::LinkedList::<u64>::new(); + b.iter(|| { + list.push_front(black_box(42u64)); + }); + }); + + group.bench_function("pingora_push_head", |b| { + let mut list = LinkedList::with_capacity(BENCH_SIZE); + b.iter(|| { + list.push_head(black_box(42u64)); + }); + }); + + group.finish(); +} - // push bench +// --------------------------------------------------------------------------- +// iter +// --------------------------------------------------------------------------- +/// Measures a complete traversal of a pre-built list. +/// The list is constructed once and reused across all Criterion iterations. fn bench_iter(c: &mut Criterion) { let mut std_list = std::collections::LinkedList::<u64>::new(); - let before = Instant::now(); - for _ in 0..ITEMS { - std_list.push_front(0); - } - let elapsed = before.elapsed(); - println!( - "std linked list push_front total {elapsed:?}, {:?} avg per operation", - elapsed / ITEMS as u32 - ); - - let mut list = pingora_lru::linked_list::LinkedList::with_capacity(ITEMS); - let before = Instant::now(); - for _ in 0..ITEMS { - list.push_head(0); + let mut pingora_list = LinkedList::with_capacity(BENCH_SIZE); + for i in 0..BENCH_SIZE as u64 { + std_list.push_front(i); + pingora_list.push_head(i); } - let elapsed = before.elapsed(); - println!( - "pingora linked list push_head total {elapsed:?}, {:?} avg per operation", - elapsed / ITEMS as u32 - ); - - // iter bench - - let mut count = 0; - let before = Instant::now(); - for _ in std_list.iter() { - count += 1; - } - let elapsed = before.elapsed(); - println!( - "std linked list iter total {count} {elapsed:?}, {:?} avg per operation", - elapsed / count as u32 - ); - - let mut count = 0; - let before = Instant::now(); - for _ in list.iter() { - count += 1; - } - let elapsed = 
before.elapsed(); - println!( - "pingora linked list iter total {count} {elapsed:?}, {:?} avg per operation", - elapsed / count as u32 - ); - // search bench + let mut group = c.benchmark_group("linked_list/iter"); + group.throughput(Throughput::Elements(BENCH_SIZE as u64)); + + group.bench_function("std_iter", |b| { + b.iter(|| { + // Fold to prevent the compiler from eliding the iteration. + let sum: u64 = std_list.iter().fold(0u64, |acc, v| acc.wrapping_add(*v)); + black_box(sum); + }); + }); + + group.bench_function("pingora_iter", |b| { + b.iter(|| { + let sum: u64 = pingora_list + .iter() + .fold(0u64, |acc, v| acc.wrapping_add(*v)); + black_box(sum); + }); + }); + + group.finish(); +} - let before = Instant::now(); - for _ in 0..ITEMS { - assert!(!std_list.iter().take(10).any(|v| *v == 1)); - } - let elapsed = before.elapsed(); - println!( - "std linked search first 10 items total {elapsed:?}, {:?} avg per operation", - elapsed / ITEMS as u32 - ); - - let before = Instant::now(); - for _ in 0..ITEMS { - assert!(!list.iter().take(10).any(|v| *v == 1)); - } - let elapsed = before.elapsed(); - println!( - "pingora linked search first 10 items total {elapsed:?}, {:?} avg per operation", - elapsed / ITEMS as u32 - ); - - let before = Instant::now(); - for _ in 0..ITEMS { - assert!(!list.exist_near_head(1, 10)); - } - let elapsed = before.elapsed(); - println!( - "pingora linked optimized search first 10 items total {elapsed:?}, {:?} avg per operation", - elapsed / ITEMS as u32 - ); - - // move node bench - let before = Instant::now(); - for _ in 0..ITEMS { - let value = std_list.pop_back().unwrap(); - std_list.push_front(value); - } - let elapsed = before.elapsed(); - println!( - "std linked list move back to front total {elapsed:?}, {:?} avg per operation", - elapsed / ITEMS as u32 - ); - - let before = Instant::now(); - for _ in 0..ITEMS { - let index = list.tail().unwrap(); - list.promote(index); +// 
--------------------------------------------------------------------------- +// search (first 10 elements) +// --------------------------------------------------------------------------- + +/// Compares three ways to check whether a value exists within the first +/// 10 nodes of the list: std linear scan, pingora linear scan, and +/// pingora's optimised `exist_near_head` intrinsic. +fn bench_search(c: &mut Criterion) { + let mut std_list = std::collections::LinkedList::<u64>::new(); + let mut pingora_list = LinkedList::with_capacity(BENCH_SIZE); + // Value 1 is not near the head (head holds BENCH_SIZE-1), so every + // search terminates after 10 comparisons — the worst case for the + // "first 10" check. + for i in 0..BENCH_SIZE as u64 { + std_list.push_front(i); + pingora_list.push_head(i); } - let elapsed = before.elapsed(); - println!( - "pingora linked list move tail to head total {elapsed:?}, {:?} avg per operation", - elapsed / ITEMS as u32 - ); - // pop bench + let mut group = c.benchmark_group("linked_list/search_first_10"); + group.throughput(Throughput::Elements(1)); - let before = Instant::now(); - for _ in 0..ITEMS { - std_list.pop_back(); - } - let elapsed = before.elapsed(); - println!( - "std linked list pop_back {elapsed:?}, {:?} avg per operation", - elapsed / ITEMS as u32 - ); - - let before = Instant::now(); - for _ in 0..ITEMS { - list.pop_tail(); - } - let elapsed = before.elapsed(); - println!( - "pingora linked list pop_tail total {elapsed:?}, {:?} avg per operation", - elapsed / ITEMS as u32 - ); + group.bench_function("std_linear_search", |b| { + b.iter(|| { + black_box(std_list.iter().take(10).any(|v| *v == 1)); + }); + }); + + group.bench_function("pingora_linear_search", |b| { + b.iter(|| { + black_box(pingora_list.iter().take(10).any(|v| *v == 1)); + }); + }); + + group.bench_function("pingora_exist_near_head", |b| { + b.iter(|| { + black_box(pingora_list.exist_near_head(1, 10)); + }); + }); + + group.finish(); } + +// 
--------------------------------------------------------------------------- +// promote (move tail to head) +// --------------------------------------------------------------------------- + +/// Measures a single tail→head move on a stable-sized list. +/// The std implementation has no dedicated promote: it does pop_back + +/// push_front. Pingora has a zero-copy `promote` that only rewires +/// pointers. +fn bench_promote(c: &mut Criterion) { + let mut group = c.benchmark_group("linked_list/promote"); + group.throughput(Throughput::Elements(1)); + + group.bench_function("std_pop_back_push_front", |b| { + let mut list = std::collections::LinkedList::<u64>::new(); + for i in 0..BENCH_SIZE as u64 { + list.push_front(i); + } + b.iter(|| { + // Rotate: pop from back, push to front. + let value = list.pop_back().unwrap(); + list.push_front(value); + }); + }); + + group.bench_function("pingora_promote", |b| { + let mut list = LinkedList::with_capacity(BENCH_SIZE); + for i in 0..BENCH_SIZE as u64 { + list.push_head(i); + } + b.iter(|| { + let index = list.tail().unwrap(); + list.promote(index); + }); + }); + + group.finish(); +} + +// --------------------------------------------------------------------------- +// pop (drain the whole list) +// --------------------------------------------------------------------------- + +/// Measures the throughput of draining an entire list. +/// `iter_batched` with `LargeInput` tells Criterion that setup is +/// expensive: it will build a fresh list per batch rather than per +/// individual iteration, and report time-per-batch. 
+fn bench_pop(c: &mut Criterion) { + let mut group = c.benchmark_group("linked_list/pop"); + group.throughput(Throughput::Elements(POP_BATCH_SIZE as u64)); + + group.bench_function("std_pop_back", |b| { + b.iter_batched( + || { + let mut list = std::collections::LinkedList::<u64>::new(); + for i in 0..POP_BATCH_SIZE as u64 { + list.push_front(i); + } + list + }, + |mut list| { + for _ in 0..POP_BATCH_SIZE { + black_box(list.pop_back()); + } + }, + BatchSize::LargeInput, + ); + }); + + group.bench_function("pingora_pop_tail", |b| { + b.iter_batched( + || { + let mut list = LinkedList::with_capacity(POP_BATCH_SIZE); + for i in 0..POP_BATCH_SIZE as u64 { + list.push_head(i); + } + list + }, + |mut list| { + for _ in 0..POP_BATCH_SIZE { + black_box(list.pop_tail()); + } + }, + BatchSize::LargeInput, + ); + }); + + group.finish(); +} + +// --------------------------------------------------------------------------- +// Registry +// --------------------------------------------------------------------------- + +criterion_group!( + benches, + bench_push, + bench_iter, + bench_search, + bench_promote, + bench_pop, +); +criterion_main!(benches); diff --git a/pingora-lru/benches/bench_lru.rs b/pingora-lru/benches/bench_lru.rs index c0bdc776..0ad4d536 100644 --- a/pingora-lru/benches/bench_lru.rs +++ b/pingora-lru/benches/bench_lru.rs @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use criterion::{criterion_group, criterion_main, Criterion, Throughput}; use rand::distributions::WeightedIndex; use rand::prelude::*; +use std::hint::black_box; use std::sync::Arc; use std::thread; use std::time::Instant; -// Non-uniform distributions, 100 items, 10 of them are 100x more likely to appear +// Non-uniform distribution: 100 items, 10 of them are 100x more likely to appear const WEIGHTS: &[usize] = &[ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -26,123 +28,155 @@ const WEIGHTS: &[usize] = &[ 100, 100, 100, 100, 100, 100, 100, ]; -const ITERATIONS: usize = 5_000_000; const THREADS: usize = 8; -fn main() { - let lru = parking_lot::Mutex::new(lru::LruCache::<u64, ()>::unbounded()); +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- - let plru = pingora_lru::Lru::<(), 10>::with_capacity(1000, 100); - // populate first, then we bench access/promotion +type StdLru = parking_lot::Mutex<lru::LruCache<u64, ()>>; + +fn build_std_lru() -> StdLru { + let cache = parking_lot::Mutex::new(lru::LruCache::<u64, ()>::unbounded()); for i in 0..WEIGHTS.len() { - lru.lock().put(i as u64, ()); + cache.lock().put(i as u64, ()); } + cache +} + +fn build_pingora_lru() -> pingora_lru::Lru<(), 10> { + let plru = pingora_lru::Lru::<(), 10>::with_capacity(1000, 100); for i in 0..WEIGHTS.len() { plru.admit(i as u64, (), 1); } + plru +} - // single thread - let mut rng = thread_rng(); +// --------------------------------------------------------------------------- +// Single-threaded +// --------------------------------------------------------------------------- + +/// Measures the cost of a single cache access/promotion. 
+/// Uses a non-uniform (Zipf-like) key distribution matching the original +/// benchmark: 10 out of 100 keys are 100× more likely to be sampled. +fn bench_single_thread(c: &mut Criterion) { + let std_lru = build_std_lru(); + let plru = build_pingora_lru(); let dist = WeightedIndex::new(WEIGHTS).unwrap(); + let mut rng = thread_rng(); - let before = Instant::now(); - for _ in 0..ITERATIONS { - lru.lock().get(&(dist.sample(&mut rng) as u64)); - } - let elapsed = before.elapsed(); - println!( - "lru promote total {elapsed:?}, {:?} avg per operation", - elapsed / ITERATIONS as u32 - ); - - let before = Instant::now(); - for _ in 0..ITERATIONS { - plru.promote(dist.sample(&mut rng) as u64); - } - let elapsed = before.elapsed(); - println!( - "pingora lru promote total {elapsed:?}, {:?} avg per operation", - elapsed / ITERATIONS as u32 - ); - - let before = Instant::now(); - for _ in 0..ITERATIONS { - plru.promote_top_n(dist.sample(&mut rng) as u64, 10); - } - let elapsed = before.elapsed(); - println!( - "pingora lru promote_top_10 total {elapsed:?}, {:?} avg per operation", - elapsed / ITERATIONS as u32 - ); - - // concurrent - - let lru = Arc::new(lru); - let mut handlers = vec![]; - for i in 0..THREADS { - let lru = lru.clone(); - let handler = thread::spawn(move || { - let mut rng = thread_rng(); - let dist = WeightedIndex::new(WEIGHTS).unwrap(); - let before = Instant::now(); - for _ in 0..ITERATIONS { - lru.lock().get(&(dist.sample(&mut rng) as u64)); + let mut group = c.benchmark_group("lru/single_thread"); + group.throughput(Throughput::Elements(1)); + + group.bench_function("std_lru_get", |b| { + b.iter(|| { + let key = dist.sample(&mut rng) as u64; + black_box(std_lru.lock().get(&key).is_some()); + }); + }); + + group.bench_function("pingora_lru_promote", |b| { + b.iter(|| { + plru.promote(black_box(dist.sample(&mut rng) as u64)); + }); + }); + + group.bench_function("pingora_lru_promote_top_10", |b| { + b.iter(|| { + 
plru.promote_top_n(black_box(dist.sample(&mut rng) as u64), 10); + }); + }); + + group.finish(); +} + +// --------------------------------------------------------------------------- +// Concurrent (8 threads) +// --------------------------------------------------------------------------- + +/// Measures throughput under contention with THREADS concurrent workers. +/// +/// `iter_custom` is used because Criterion does not natively manage threads. +/// We time the wall-clock duration of all threads completing `iters` +/// operations each, which closely mirrors the original manual benchmark. +fn bench_concurrent(c: &mut Criterion) { + let mut group = c.benchmark_group("lru/concurrent"); + // Report throughput as total operations across all threads. + group.throughput(Throughput::Elements(THREADS as u64)); + + group.bench_function("std_lru_get", |b| { + let lru = Arc::new(build_std_lru()); + b.iter_custom(|iters| { + let mut handles = Vec::with_capacity(THREADS); + let start = Instant::now(); + for _ in 0..THREADS { + let lru = Arc::clone(&lru); + handles.push(thread::spawn(move || { + let mut rng = thread_rng(); + let dist = WeightedIndex::new(WEIGHTS).unwrap(); + for _ in 0..iters { + let key = dist.sample(&mut rng) as u64; + black_box(lru.lock().get(&key).is_some()); + } + })); } - let elapsed = before.elapsed(); - println!( - "lru promote total {elapsed:?}, {:?} avg per operation thread {i}", - elapsed / ITERATIONS as u32 - ); + for h in handles { + h.join().unwrap(); + } + start.elapsed() }); - handlers.push(handler); - } - for thread in handlers { - thread.join().unwrap(); - } + }); - let plru = Arc::new(plru); - - let mut handlers = vec![]; - for i in 0..THREADS { - let plru = plru.clone(); - let handler = thread::spawn(move || { - let mut rng = thread_rng(); - let dist = WeightedIndex::new(WEIGHTS).unwrap(); - let before = Instant::now(); - for _ in 0..ITERATIONS { - plru.promote(dist.sample(&mut rng) as u64); + group.bench_function("pingora_lru_promote", |b| { 
+ let plru = Arc::new(build_pingora_lru()); + b.iter_custom(|iters| { + let mut handles = Vec::with_capacity(THREADS); + let start = Instant::now(); + for _ in 0..THREADS { + let plru = Arc::clone(&plru); + handles.push(thread::spawn(move || { + let mut rng = thread_rng(); + let dist = WeightedIndex::new(WEIGHTS).unwrap(); + for _ in 0..iters { + plru.promote(black_box(dist.sample(&mut rng) as u64)); + } + })); + } + for h in handles { + h.join().unwrap(); } - let elapsed = before.elapsed(); - println!( - "pingora lru promote total {elapsed:?}, {:?} avg per operation thread {i}", - elapsed / ITERATIONS as u32 - ); + start.elapsed() }); - handlers.push(handler); - } - for thread in handlers { - thread.join().unwrap(); - } + }); - let mut handlers = vec![]; - for i in 0..THREADS { - let plru = plru.clone(); - let handler = thread::spawn(move || { - let mut rng = thread_rng(); - let dist = WeightedIndex::new(WEIGHTS).unwrap(); - let before = Instant::now(); - for _ in 0..ITERATIONS { - plru.promote_top_n(dist.sample(&mut rng) as u64, 10); + group.bench_function("pingora_lru_promote_top_10", |b| { + let plru = Arc::new(build_pingora_lru()); + b.iter_custom(|iters| { + let mut handles = Vec::with_capacity(THREADS); + let start = Instant::now(); + for _ in 0..THREADS { + let plru = Arc::clone(&plru); + handles.push(thread::spawn(move || { + let mut rng = thread_rng(); + let dist = WeightedIndex::new(WEIGHTS).unwrap(); + for _ in 0..iters { + plru.promote_top_n(black_box(dist.sample(&mut rng) as u64), 10); + } + })); } - let elapsed = before.elapsed(); - println!( - "pingora lru promote_top_10 total {elapsed:?}, {:?} avg per operation thread {i}", - elapsed / ITERATIONS as u32 - ); + for h in handles { + h.join().unwrap(); + } + start.elapsed() }); - handlers.push(handler); - } - for thread in handlers { - thread.join().unwrap(); - } + }); + + group.finish(); } + +// --------------------------------------------------------------------------- +// Registry +// 
--------------------------------------------------------------------------- + +criterion_group!(benches, bench_single_thread, bench_concurrent); +criterion_main!(benches);