diff --git a/pingora-lru/Cargo.toml b/pingora-lru/Cargo.toml index 3eae82b9..c5ce8bcd 100644 --- a/pingora-lru/Cargo.toml +++ b/pingora-lru/Cargo.toml @@ -24,6 +24,7 @@ rand = "0.8" [dev-dependencies] lru = { workspace = true } +criterion = { version = "0.5", features = ["html_reports"] } [[bench]] name = "bench_linked_list" diff --git a/pingora-lru/benches/bench_linked_list.rs b/pingora-lru/benches/bench_linked_list.rs index 5fc0e50a..8af84a46 100644 --- a/pingora-lru/benches/bench_linked_list.rs +++ b/pingora-lru/benches/bench_linked_list.rs @@ -12,133 +12,225 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::time::Instant; - -fn main() { - const ITEMS: usize = 5_000_000; +use criterion::{criterion_group, criterion_main, BatchSize, Criterion, Throughput}; +use pingora_lru::linked_list::LinkedList; +use std::hint::black_box; + +/// Number of items pre-loaded into the list for benchmarks that measure +/// a single operation against an existing list (promote, search, iter). +const BENCH_SIZE: usize = 1_000_000; + +/// Smaller size used for bulk-pop benchmarks so each Criterion iteration +/// completes in a reasonable time while still being representative. +const POP_BATCH_SIZE: usize = 100_000; + +// --------------------------------------------------------------------------- +// push +// --------------------------------------------------------------------------- + +/// Measures the cost of a single push onto a growing list. +/// Both lists start empty and grow without bound across iterations — +/// this is intentional: we want to capture steady-state push cost, not +/// amortised allocation cost. 
+fn bench_push(c: &mut Criterion) { + let mut group = c.benchmark_group("linked_list/push"); + group.throughput(Throughput::Elements(1)); + + group.bench_function("std_push_front", |b| { + let mut list = std::collections::LinkedList::<u64>::new(); + b.iter(|| { + list.push_front(black_box(42u64)); + }); + }); + + group.bench_function("pingora_push_head", |b| { + let mut list = LinkedList::with_capacity(BENCH_SIZE); + b.iter(|| { + list.push_head(black_box(42u64)); + }); + }); + + group.finish(); +} - // push bench +// --------------------------------------------------------------------------- +// iter +// --------------------------------------------------------------------------- +/// Measures a complete traversal of a pre-built list. +/// The list is constructed once and reused across all Criterion iterations. fn bench_iter(c: &mut Criterion) { let mut std_list = std::collections::LinkedList::<u64>::new(); - let before = Instant::now(); - for _ in 0..ITEMS { - std_list.push_front(0); - } - let elapsed = before.elapsed(); - println!( - "std linked list push_front total {elapsed:?}, {:?} avg per operation", - elapsed / ITEMS as u32 - ); - - let mut list = pingora_lru::linked_list::LinkedList::with_capacity(ITEMS); - let before = Instant::now(); - for _ in 0..ITEMS { - list.push_head(0); + let mut pingora_list = LinkedList::with_capacity(BENCH_SIZE); + for i in 0..BENCH_SIZE as u64 { + std_list.push_front(i); + pingora_list.push_head(i); } - let elapsed = before.elapsed(); - println!( - "pingora linked list push_head total {elapsed:?}, {:?} avg per operation", - elapsed / ITEMS as u32 - ); - - // iter bench - - let mut count = 0; - let before = Instant::now(); - for _ in std_list.iter() { - count += 1; - } - let elapsed = before.elapsed(); - println!( - "std linked list iter total {count} {elapsed:?}, {:?} avg per operation", - elapsed / count as u32 - ); - - let mut count = 0; - let before = Instant::now(); - for _ in list.iter() { - count += 1; - } - let elapsed = 
before.elapsed(); - println!( - "pingora linked list iter total {count} {elapsed:?}, {:?} avg per operation", - elapsed / count as u32 - ); - // search bench + let mut group = c.benchmark_group("linked_list/iter"); + group.throughput(Throughput::Elements(BENCH_SIZE as u64)); + + group.bench_function("std_iter", |b| { + b.iter(|| { + // Fold to prevent the compiler from eliding the iteration. + let sum: u64 = std_list.iter().fold(0u64, |acc, v| acc.wrapping_add(*v)); + black_box(sum); + }); + }); + + group.bench_function("pingora_iter", |b| { + b.iter(|| { + let sum: u64 = pingora_list + .iter() + .fold(0u64, |acc, v| acc.wrapping_add(*v)); + black_box(sum); + }); + }); + + group.finish(); +} - let before = Instant::now(); - for _ in 0..ITEMS { - assert!(!std_list.iter().take(10).any(|v| *v == 1)); - } - let elapsed = before.elapsed(); - println!( - "std linked search first 10 items total {elapsed:?}, {:?} avg per operation", - elapsed / ITEMS as u32 - ); - - let before = Instant::now(); - for _ in 0..ITEMS { - assert!(!list.iter().take(10).any(|v| *v == 1)); - } - let elapsed = before.elapsed(); - println!( - "pingora linked search first 10 items total {elapsed:?}, {:?} avg per operation", - elapsed / ITEMS as u32 - ); - - let before = Instant::now(); - for _ in 0..ITEMS { - assert!(!list.exist_near_head(1, 10)); - } - let elapsed = before.elapsed(); - println!( - "pingora linked optimized search first 10 items total {elapsed:?}, {:?} avg per operation", - elapsed / ITEMS as u32 - ); - - // move node bench - let before = Instant::now(); - for _ in 0..ITEMS { - let value = std_list.pop_back().unwrap(); - std_list.push_front(value); - } - let elapsed = before.elapsed(); - println!( - "std linked list move back to front total {elapsed:?}, {:?} avg per operation", - elapsed / ITEMS as u32 - ); - - let before = Instant::now(); - for _ in 0..ITEMS { - let index = list.tail().unwrap(); - list.promote(index); +// 
--------------------------------------------------------------------------- +// search (first 10 elements) +// --------------------------------------------------------------------------- + +/// Compares three ways to check whether a value exists within the first +/// 10 nodes of the list: std linear scan, pingora linear scan, and +/// pingora's optimised `exist_near_head` intrinsic. +fn bench_search(c: &mut Criterion) { + let mut std_list = std::collections::LinkedList::<u64>::new(); + let mut pingora_list = LinkedList::with_capacity(BENCH_SIZE); + // Value 1 is not near the head (head holds BENCH_SIZE-1), so every + // search terminates after 10 comparisons — the worst case for the + // "first 10" check. + for i in 0..BENCH_SIZE as u64 { + std_list.push_front(i); + pingora_list.push_head(i); } - let elapsed = before.elapsed(); - println!( - "pingora linked list move tail to head total {elapsed:?}, {:?} avg per operation", - elapsed / ITEMS as u32 - ); - // pop bench + let mut group = c.benchmark_group("linked_list/search_first_10"); + group.throughput(Throughput::Elements(1)); - let before = Instant::now(); - for _ in 0..ITEMS { - std_list.pop_back(); - } - let elapsed = before.elapsed(); - println!( - "std linked list pop_back {elapsed:?}, {:?} avg per operation", - elapsed / ITEMS as u32 - ); - - let before = Instant::now(); - for _ in 0..ITEMS { - list.pop_tail(); - } - let elapsed = before.elapsed(); - println!( - "pingora linked list pop_tail total {elapsed:?}, {:?} avg per operation", - elapsed / ITEMS as u32 - ); + group.bench_function("std_linear_search", |b| { + b.iter(|| { + black_box(std_list.iter().take(10).any(|v| *v == 1)); + }); + }); + + group.bench_function("pingora_linear_search", |b| { + b.iter(|| { + black_box(pingora_list.iter().take(10).any(|v| *v == 1)); + }); + }); + + group.bench_function("pingora_exist_near_head", |b| { + b.iter(|| { + black_box(pingora_list.exist_near_head(1, 10)); + }); + }); + + group.finish(); } + +// 
--------------------------------------------------------------------------- +// promote (move tail to head) +// --------------------------------------------------------------------------- + +/// Measures a single tail→head move on a stable-sized list. +/// The std implementation has no dedicated promote: it does pop_back + +/// push_front. Pingora has a zero-copy `promote` that only rewires +/// pointers. +fn bench_promote(c: &mut Criterion) { + let mut group = c.benchmark_group("linked_list/promote"); + group.throughput(Throughput::Elements(1)); + + group.bench_function("std_pop_back_push_front", |b| { + let mut list = std::collections::LinkedList::<u64>::new(); + for i in 0..BENCH_SIZE as u64 { + list.push_front(i); + } + b.iter(|| { + // Rotate: pop from back, push to front. + let value = list.pop_back().unwrap(); + list.push_front(value); + }); + }); + + group.bench_function("pingora_promote", |b| { + let mut list = LinkedList::with_capacity(BENCH_SIZE); + for i in 0..BENCH_SIZE as u64 { + list.push_head(i); + } + b.iter(|| { + let index = list.tail().unwrap(); + list.promote(index); + }); + }); + + group.finish(); +} + +// --------------------------------------------------------------------------- +// pop (drain the whole list) +// --------------------------------------------------------------------------- + +/// Measures the throughput of draining an entire list. +/// `iter_batched` with `LargeInput` tells Criterion that setup is +/// expensive: it will build a fresh list per batch rather than per +/// individual iteration, and report time-per-batch. 
+fn bench_pop(c: &mut Criterion) { + let mut group = c.benchmark_group("linked_list/pop"); + group.throughput(Throughput::Elements(POP_BATCH_SIZE as u64)); + + group.bench_function("std_pop_back", |b| { + b.iter_batched( + || { + let mut list = std::collections::LinkedList::<u64>::new(); + for i in 0..POP_BATCH_SIZE as u64 { + list.push_front(i); + } + list + }, + |mut list| { + for _ in 0..POP_BATCH_SIZE { + black_box(list.pop_back()); + } + }, + BatchSize::LargeInput, + ); + }); + + group.bench_function("pingora_pop_tail", |b| { + b.iter_batched( + || { + let mut list = LinkedList::with_capacity(POP_BATCH_SIZE); + for i in 0..POP_BATCH_SIZE as u64 { + list.push_head(i); + } + list + }, + |mut list| { + for _ in 0..POP_BATCH_SIZE { + black_box(list.pop_tail()); + } + }, + BatchSize::LargeInput, + ); + }); + + group.finish(); +} + +// --------------------------------------------------------------------------- +// Registry +// --------------------------------------------------------------------------- + +criterion_group!( + benches, + bench_push, + bench_iter, + bench_search, + bench_promote, + bench_pop, +); +criterion_main!(benches); diff --git a/pingora-lru/benches/bench_lru.rs b/pingora-lru/benches/bench_lru.rs index c0bdc776..0ad4d536 100644 --- a/pingora-lru/benches/bench_lru.rs +++ b/pingora-lru/benches/bench_lru.rs @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use criterion::{criterion_group, criterion_main, Criterion, Throughput}; use rand::distributions::WeightedIndex; use rand::prelude::*; +use std::hint::black_box; use std::sync::Arc; use std::thread; use std::time::Instant; -// Non-uniform distributions, 100 items, 10 of them are 100x more likely to appear +// Non-uniform distribution: 100 items, 10 of them are 100x more likely to appear const WEIGHTS: &[usize] = &[ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -26,123 +28,155 @@ const WEIGHTS: &[usize] = &[ 100, 100, 100, 100, 100, 100, 100, ]; -const ITERATIONS: usize = 5_000_000; const THREADS: usize = 8; -fn main() { - let lru = parking_lot::Mutex::new(lru::LruCache::<u64, ()>::unbounded()); +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- - let plru = pingora_lru::Lru::<(), 10>::with_capacity(1000, 100); - // populate first, then we bench access/promotion +type StdLru = parking_lot::Mutex<lru::LruCache<u64, ()>>; + +fn build_std_lru() -> StdLru { + let cache = parking_lot::Mutex::new(lru::LruCache::<u64, ()>::unbounded()); for i in 0..WEIGHTS.len() { - lru.lock().put(i as u64, ()); + cache.lock().put(i as u64, ()); } + cache +} + +fn build_pingora_lru() -> pingora_lru::Lru<(), 10> { + let plru = pingora_lru::Lru::<(), 10>::with_capacity(1000, 100); for i in 0..WEIGHTS.len() { plru.admit(i as u64, (), 1); } + plru +} - // single thread - let mut rng = thread_rng(); +// --------------------------------------------------------------------------- +// Single-threaded +// --------------------------------------------------------------------------- + +/// Measures the cost of a single cache access/promotion. 
+/// Uses a non-uniform (Zipf-like) key distribution matching the original +/// benchmark: 10 out of 100 keys are 100× more likely to be sampled. +fn bench_single_thread(c: &mut Criterion) { + let std_lru = build_std_lru(); + let plru = build_pingora_lru(); let dist = WeightedIndex::new(WEIGHTS).unwrap(); + let mut rng = thread_rng(); - let before = Instant::now(); - for _ in 0..ITERATIONS { - lru.lock().get(&(dist.sample(&mut rng) as u64)); - } - let elapsed = before.elapsed(); - println!( - "lru promote total {elapsed:?}, {:?} avg per operation", - elapsed / ITERATIONS as u32 - ); - - let before = Instant::now(); - for _ in 0..ITERATIONS { - plru.promote(dist.sample(&mut rng) as u64); - } - let elapsed = before.elapsed(); - println!( - "pingora lru promote total {elapsed:?}, {:?} avg per operation", - elapsed / ITERATIONS as u32 - ); - - let before = Instant::now(); - for _ in 0..ITERATIONS { - plru.promote_top_n(dist.sample(&mut rng) as u64, 10); - } - let elapsed = before.elapsed(); - println!( - "pingora lru promote_top_10 total {elapsed:?}, {:?} avg per operation", - elapsed / ITERATIONS as u32 - ); - - // concurrent - - let lru = Arc::new(lru); - let mut handlers = vec![]; - for i in 0..THREADS { - let lru = lru.clone(); - let handler = thread::spawn(move || { - let mut rng = thread_rng(); - let dist = WeightedIndex::new(WEIGHTS).unwrap(); - let before = Instant::now(); - for _ in 0..ITERATIONS { - lru.lock().get(&(dist.sample(&mut rng) as u64)); + let mut group = c.benchmark_group("lru/single_thread"); + group.throughput(Throughput::Elements(1)); + + group.bench_function("std_lru_get", |b| { + b.iter(|| { + let key = dist.sample(&mut rng) as u64; + black_box(std_lru.lock().get(&key).is_some()); + }); + }); + + group.bench_function("pingora_lru_promote", |b| { + b.iter(|| { + plru.promote(black_box(dist.sample(&mut rng) as u64)); + }); + }); + + group.bench_function("pingora_lru_promote_top_10", |b| { + b.iter(|| { + 
plru.promote_top_n(black_box(dist.sample(&mut rng) as u64), 10); + }); + }); + + group.finish(); +} + +// --------------------------------------------------------------------------- +// Concurrent (8 threads) +// --------------------------------------------------------------------------- + +/// Measures throughput under contention with THREADS concurrent workers. +/// +/// `iter_custom` is used because Criterion does not natively manage threads. +/// We time the wall-clock duration of all threads completing `iters` +/// operations each, which closely mirrors the original manual benchmark. +fn bench_concurrent(c: &mut Criterion) { + let mut group = c.benchmark_group("lru/concurrent"); + // Report throughput as total operations across all threads. + group.throughput(Throughput::Elements(THREADS as u64)); + + group.bench_function("std_lru_get", |b| { + let lru = Arc::new(build_std_lru()); + b.iter_custom(|iters| { + let mut handles = Vec::with_capacity(THREADS); + let start = Instant::now(); + for _ in 0..THREADS { + let lru = Arc::clone(&lru); + handles.push(thread::spawn(move || { + let mut rng = thread_rng(); + let dist = WeightedIndex::new(WEIGHTS).unwrap(); + for _ in 0..iters { + let key = dist.sample(&mut rng) as u64; + black_box(lru.lock().get(&key).is_some()); + } + })); } - let elapsed = before.elapsed(); - println!( - "lru promote total {elapsed:?}, {:?} avg per operation thread {i}", - elapsed / ITERATIONS as u32 - ); + for h in handles { + h.join().unwrap(); + } + start.elapsed() }); - handlers.push(handler); - } - for thread in handlers { - thread.join().unwrap(); - } + }); - let plru = Arc::new(plru); - - let mut handlers = vec![]; - for i in 0..THREADS { - let plru = plru.clone(); - let handler = thread::spawn(move || { - let mut rng = thread_rng(); - let dist = WeightedIndex::new(WEIGHTS).unwrap(); - let before = Instant::now(); - for _ in 0..ITERATIONS { - plru.promote(dist.sample(&mut rng) as u64); + group.bench_function("pingora_lru_promote", |b| { 
+ let plru = Arc::new(build_pingora_lru()); + b.iter_custom(|iters| { + let mut handles = Vec::with_capacity(THREADS); + let start = Instant::now(); + for _ in 0..THREADS { + let plru = Arc::clone(&plru); + handles.push(thread::spawn(move || { + let mut rng = thread_rng(); + let dist = WeightedIndex::new(WEIGHTS).unwrap(); + for _ in 0..iters { + plru.promote(black_box(dist.sample(&mut rng) as u64)); + } + })); + } + for h in handles { + h.join().unwrap(); } - let elapsed = before.elapsed(); - println!( - "pingora lru promote total {elapsed:?}, {:?} avg per operation thread {i}", - elapsed / ITERATIONS as u32 - ); + start.elapsed() }); - handlers.push(handler); - } - for thread in handlers { - thread.join().unwrap(); - } + }); - let mut handlers = vec![]; - for i in 0..THREADS { - let plru = plru.clone(); - let handler = thread::spawn(move || { - let mut rng = thread_rng(); - let dist = WeightedIndex::new(WEIGHTS).unwrap(); - let before = Instant::now(); - for _ in 0..ITERATIONS { - plru.promote_top_n(dist.sample(&mut rng) as u64, 10); + group.bench_function("pingora_lru_promote_top_10", |b| { + let plru = Arc::new(build_pingora_lru()); + b.iter_custom(|iters| { + let mut handles = Vec::with_capacity(THREADS); + let start = Instant::now(); + for _ in 0..THREADS { + let plru = Arc::clone(&plru); + handles.push(thread::spawn(move || { + let mut rng = thread_rng(); + let dist = WeightedIndex::new(WEIGHTS).unwrap(); + for _ in 0..iters { + plru.promote_top_n(black_box(dist.sample(&mut rng) as u64), 10); + } + })); } - let elapsed = before.elapsed(); - println!( - "pingora lru promote_top_10 total {elapsed:?}, {:?} avg per operation thread {i}", - elapsed / ITERATIONS as u32 - ); + for h in handles { + h.join().unwrap(); + } + start.elapsed() }); - handlers.push(handler); - } - for thread in handlers { - thread.join().unwrap(); - } + }); + + group.finish(); } + +// --------------------------------------------------------------------------- +// Registry +// 
--------------------------------------------------------------------------- + +criterion_group!(benches, bench_single_thread, bench_concurrent); +criterion_main!(benches);