diff --git a/src/snmalloc/backend_helpers/statsrange.h b/src/snmalloc/backend_helpers/statsrange.h
index d1e213777..8fe676fb6 100644
--- a/src/snmalloc/backend_helpers/statsrange.h
+++ b/src/snmalloc/backend_helpers/statsrange.h
@@ -16,8 +16,7 @@ namespace snmalloc
   {
     using ContainsParent<ParentRange>::parent;

-    static inline stl::Atomic<size_t> current_usage{};
-    static inline stl::Atomic<size_t> peak_usage{};
+    static inline Stat usage{};

   public:
     static constexpr bool Aligned = ParentRange::Aligned;
@@ -30,34 +29,26 @@ namespace snmalloc

     CapPtr<void, ChunkBounds> alloc_range(size_t size)
     {
-      auto result = parent.alloc_range(size);
-      if (result != nullptr)
-      {
-        auto prev = current_usage.fetch_add(size);
-        auto curr = peak_usage.load();
-        while (curr < prev + size)
-        {
-          if (peak_usage.compare_exchange_weak(curr, prev + size))
-            break;
-        }
-      }
-      return result;
+      auto r = parent.alloc_range(size);
+      if (r != nullptr)
+        usage += size;
+      return r;
     }

     void dealloc_range(CapPtr<void, ChunkBounds> base, size_t size)
     {
-      current_usage -= size;
+      usage -= size;
       parent.dealloc_range(base, size);
     }

     size_t get_current_usage()
     {
-      return current_usage.load();
+      return usage.get_curr();
     }

     size_t get_peak_usage()
     {
-      return peak_usage.load();
+      return usage.get_peak();
     }
   };
 };
diff --git a/src/snmalloc/ds_aal/ds_aal.h b/src/snmalloc/ds_aal/ds_aal.h
index 21eeb8dd6..e0b4ac202 100644
--- a/src/snmalloc/ds_aal/ds_aal.h
+++ b/src/snmalloc/ds_aal/ds_aal.h
@@ -7,4 +7,5 @@
 #include "../aal/aal.h"
 #include "flaglock.h"
 #include "prevent_fork.h"
+#include "seqset.h"
 #include "singleton.h"
\ No newline at end of file
diff --git a/src/snmalloc/ds_core/seqset.h b/src/snmalloc/ds_aal/seqset.h
similarity index 99%
rename from src/snmalloc/ds_core/seqset.h
rename to src/snmalloc/ds_aal/seqset.h
index 6046bca70..0ad18fb4d 100644
--- a/src/snmalloc/ds_core/seqset.h
+++ b/src/snmalloc/ds_aal/seqset.h
@@ -1,6 +1,5 @@
 #pragma once

-#include "../aal/aal.h"
 #include "../ds_core/ds_core.h"
 #include "snmalloc/stl/type_traits.h"
 #include "snmalloc/stl/utility.h"
diff --git a/src/snmalloc/ds_core/ds_core.h b/src/snmalloc/ds_core/ds_core.h
index 38e99dce2..2292b8118 100644
--- a/src/snmalloc/ds_core/ds_core.h
+++ b/src/snmalloc/ds_core/ds_core.h
@@ -15,5 +15,5 @@
 #include "mitigations.h"
 #include "ptrwrap.h"
 #include "redblacktree.h"
-#include "seqset.h"
-#include "tid.h"
\ No newline at end of file
+#include "stats.h"
+#include "tid.h"
diff --git a/src/snmalloc/ds_core/stats.h b/src/snmalloc/ds_core/stats.h
new file mode 100644
index 000000000..29d4c9a42
--- /dev/null
+++ b/src/snmalloc/ds_core/stats.h
@@ -0,0 +1,101 @@
+#pragma once
+
+#include "defines.h"
+#include "snmalloc/stl/atomic.h"
+#include "stddef.h"
+
+namespace snmalloc
+{
+  /**
+   * Very basic statistic that tracks current and peak values.
+   */
+  class Stat
+  {
+  private:
+    stl::Atomic<size_t> curr{0};
+    stl::Atomic<size_t> peak{0};
+
+  public:
+    void increase(size_t amount)
+    {
+      size_t old = curr.fetch_add(amount);
+      size_t c = old + amount;
+      size_t p = peak.load(stl::memory_order_relaxed);
+      while (c > p)
+      {
+        if (peak.compare_exchange_strong(p, c))
+          break;
+      }
+    }
+
+    void decrease(size_t amount)
+    {
+      size_t prev = curr.fetch_sub(amount);
+      SNMALLOC_ASSERT_MSG(
+        prev >= amount, "prev = {}, amount = {}", prev, amount);
+      UNUSED(prev);
+    }
+
+    size_t get_curr()
+    {
+      return curr.load(stl::memory_order_relaxed);
+    }
+
+    size_t get_peak()
+    {
+      return peak.load(stl::memory_order_relaxed);
+    }
+
+    void operator+=(size_t amount)
+    {
+      increase(amount);
+    }
+
+    void operator-=(size_t amount)
+    {
+      decrease(amount);
+    }
+
+    void operator++()
+    {
+      increase(1);
+    }
+
+    void operator--()
+    {
+      decrease(1);
+    }
+  };
+
+  /**
+   * Very basic statistic that can only grow. Not thread-safe: updates are
+   * relaxed load/store pairs rather than atomic read-modify-writes, so each
+   * instance should only be updated by its owning thread.
+   */
+  class MonotoneLocalStat
+  {
+    stl::Atomic<size_t> value{0};
+
+  public:
+    void operator++(int)
+    {
+      auto old = value.load(stl::memory_order_relaxed);
+      value.store(old + 1, stl::memory_order_relaxed);
+    }
+
+    void operator+=(const MonotoneLocalStat& other)
+    {
+      auto v = other.value.load(stl::memory_order_relaxed);
+      value.fetch_add(v, stl::memory_order_relaxed);
+    }
+
+    void operator+=(size_t v)
+    {
+      auto old = value.load(stl::memory_order_relaxed);
+      value.store(old + v, stl::memory_order_relaxed);
+    }
+
+    size_t operator*()
+    {
+      return value.load(stl::memory_order_relaxed);
+    }
+  };
+} // namespace snmalloc
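
As a reading aid, a minimal sketch of how the new Stat counter behaves (illustrative only, not part of the patch; it assumes the internal header is reachable through snmalloc/snmalloc.h):

  // Illustrative use of snmalloc::Stat (defined in ds_core/stats.h above).
  #include <snmalloc/snmalloc.h>
  #include <cassert>

  int main()
  {
    snmalloc::Stat s;
    s += 100; // current = 100, peak = 100
    s -= 40;  // current = 60, peak unchanged
    s += 30;  // current = 90, peak stays 100 since 90 < 100
    assert(s.get_curr() == 90);
    assert(s.get_peak() == 100);
  }
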
diff --git a/src/snmalloc/global/globalalloc.h b/src/snmalloc/global/globalalloc.h
index fa769e3e4..e9c9ccc7b 100644
--- a/src/snmalloc/global/globalalloc.h
+++ b/src/snmalloc/global/globalalloc.h
@@ -84,6 +84,15 @@ namespace snmalloc
       }
     }

+    if (
+      result == nullptr &&
+      RemoteDeallocCache<Config>::remote_inflight.get_curr() != 0)
+    {
+      report_fatal_error(
+        "debug_check_empty: remote inflight deallocations left {}",
+        RemoteDeallocCache<Config>::remote_inflight.get_curr());
+    }
+
     if (result != nullptr)
     {
       *result = okay;
@@ -128,6 +137,81 @@ namespace snmalloc
     }
   }

+  template<SNMALLOC_CONCEPT(IsConfig) Config>
+  inline static void get_stats(AllocStats& stats)
+  {
+    auto alloc = AllocPool<Config>::iterate();
+    while (alloc != nullptr)
+    {
+      stats += alloc->get_stats();
+      alloc = AllocPool<Config>::iterate(alloc);
+    }
+  }
+
+  template<SNMALLOC_CONCEPT(IsConfig) Config>
+  inline static void print_alloc_stats()
+  {
+    static stl::Atomic<size_t> dump{0};
+
+    auto l_dump = dump++;
+    if (l_dump == 0)
+    {
+      message<1024>(
+        "snmalloc_allocs,dumpid,sizeclass,size,allocated,deallocated,in_use,"
+        "bytes,slabs allocated,slabs deallocated,slabs in_use,slabs bytes");
+      message<1024>(
+        "snmalloc_totals,dumpid,backend bytes,peak backend "
+        "bytes,requested,slabs requested bytes,remote inflight bytes,allocator "
+        "count");
+    }
+
+    AllocStats stats;
+    snmalloc::get_stats<Config>(stats);
+    size_t total_live{0};
+    size_t total_live_slabs{0};
+    for (size_t i = 0; i < snmalloc::SIZECLASS_REP_SIZE; i++)
+    {
+      auto sc = snmalloc::sizeclass_t::from_raw(i);
+      auto allocated = *stats[sc].objects_allocated;
+      auto deallocated = *stats[sc].objects_deallocated;
+      auto slabs_allocated = *stats[sc].slabs_allocated;
+      auto slabs_deallocated = *stats[sc].slabs_deallocated;
+      if (allocated == 0 && deallocated == 0)
+        continue;
+      auto size = snmalloc::sizeclass_full_to_size(sc);
+      auto slab_size = snmalloc::sizeclass_full_to_slab_size(sc);
+      auto in_use = allocated - deallocated;
+      auto amount = in_use * size;
+      total_live += amount;
+      auto in_use_slabs = slabs_allocated - slabs_deallocated;
+      auto amount_slabs = in_use_slabs * slab_size;
+      total_live_slabs += amount_slabs;
+
+      snmalloc::message<1024>(
+        "snmalloc_allocs,{},{},{},{},{},{},{},{},{},{},{}",
+        l_dump,
+        i,
+        size,
+        allocated,
+        deallocated,
+        in_use,
+        amount,
+        slabs_allocated,
+        slabs_deallocated,
+        in_use_slabs,
+        amount_slabs);
+    }
+    snmalloc::message<1024>(
+      "snmalloc_totals,{},{},{},{},{},{},{}",
+      l_dump,
+      Config::Backend::get_current_usage(),
+      Config::Backend::get_peak_usage(),
+      total_live,
+      total_live_slabs,
+      RemoteDeallocCache<Config>::remote_inflight.get_curr(),
+      Config::pool().get_count());
+  }
+
   /**
    * Returns the number of remaining bytes in an object.
    *
diff --git a/src/snmalloc/mem/corealloc.h b/src/snmalloc/mem/alloc.h
similarity index 94%
rename from src/snmalloc/mem/corealloc.h
rename to src/snmalloc/mem/alloc.h
index 1b7f7f5b5..d2ef84113 100644
--- a/src/snmalloc/mem/corealloc.h
+++ b/src/snmalloc/mem/alloc.h
@@ -1,6 +1,7 @@
 #pragma once

 #include "../ds/ds.h"
+#include "allocstats.h"
 #include "check_init.h"
 #include "freelist.h"
 #include "metadata.h"
@@ -32,7 +33,7 @@ namespace snmalloc
   }

   template<ZeroMem zero_mem, typename Config>
-  inline static SNMALLOC_FAST_PATH capptr::Alloc<void>
+  inline static SNMALLOC_FAST_PATH void*
   finish_alloc(freelist::HeadPtr p, smallsizeclass_t sizeclass)
   {
     auto r = finish_alloc_no_zero(p, sizeclass);

     // TODO: Should this be zeroing the free Object state, in the non-zeroing
     // case?

-    return r;
+    return capptr_reveal(r);
   }

   struct FastFreeLists
@@ -156,6 +157,11 @@
      */
     Ticker<typename Config::Pal> ticker;

+    /**
+     * Tracks this allocator's memory usage
+     */
+    AllocStats stats;
+
     /**
      * The message queue needs to be accessible from other threads
      *
@@ -437,6 +443,9 @@
         post();
       }

+      // Remove the freed bytes from the global inflight statistics.
+      RemoteDeallocCache<Config>::remote_inflight -= bytes_freed;
+
       return action(args...);
     }
@@ -488,16 +497,19 @@
           freelist::Object::key_root,
           entry.get_slab_metadata()->as_key_tweak(),
           domesticate);
-        if (!need_post && !remote_dealloc_cache.reserve_space(entry, nelem))
-        {
-          need_post = true;
-        }
+
+        // Need to account for forwarded bytes.
+        size_t size = nelem * sizeclass_full_to_size(entry.get_sizeclass());
+        bytes_returned += size;
+
+        need_post |= !remote_dealloc_cache.reserve_space(entry, nelem);
+
         remote_dealloc_cache.template forward<sizeof(Allocator)>(
           entry.get_remote()->trunc_id(), msg);
       }

     template<typename Domesticator_queue>
-    SNMALLOC_FAST_PATH static auto dealloc_local_objects_fast(
+    SNMALLOC_FAST_PATH auto dealloc_local_objects_fast(
       capptr::Alloc<RemoteMessage> msg,
       const PagemapEntry& entry,
       BackendSlabMetadata* meta,
@@ -523,6 +535,9 @@ namespace snmalloc

       bytes_freed += objsize * length;

+      stats[entry.get_sizeclass()].objects_deallocated +=
+        static_cast<size_t>(length);
+
       // Update the head and the next pointer in the free list.
       meta->free_queue.append_segment(
         curr,
@@ -546,7 +561,7 @@
      * - alloc(size_t)
      *   - small_alloc(size_t)
      *     - gets allocation from a fast free list and is done.
-     *     - if no fast free list,
+     *     - otherwise, no fast free list, so calls small_alloc_slow
      *       - check for message queue
      *       - small_refill(size_t)
      *       - If another free list is available, use it.
@@ -583,17 +598,17 @@
       {
         // Small allocations are more likely. Improve
         // branch prediction by placing this case first.
-        return capptr_reveal(small_alloc<zero_mem>(size));
+        return small_alloc<zero_mem>(size);
       }

-      return capptr_reveal(alloc_not_small<zero_mem>(size, this));
+      return alloc_not_small<zero_mem>(size, this);
     }

     /**
      * Fast allocation for small objects.
      */
     template<ZeroMem zero_mem>
-    SNMALLOC_FAST_PATH capptr::Alloc<void> small_alloc(size_t size)
+    SNMALLOC_FAST_PATH void* small_alloc(size_t size)
     {
       auto domesticate =
         [this](freelist::QueuePtr p) SNMALLOC_FAST_PATH_LAMBDA {
@@ -606,12 +621,20 @@ namespace snmalloc
       if (SNMALLOC_LIKELY(!fl->empty()))
       {
         auto p = fl->take(key, domesticate);
+        stats[sizeclass].objects_allocated++;
         return finish_alloc<zero_mem, Config>(p, sizeclass);
       }

+      return small_alloc_slow<zero_mem>(sizeclass, fl);
+    }
+
+    template<ZeroMem zero_mem>
+    SNMALLOC_SLOW_PATH void*
+    small_alloc_slow(smallsizeclass_t sizeclass, freelist::Iter<>* fl)
+    {
       return handle_message_queue(
         [](Allocator* alloc, smallsizeclass_t sizeclass, freelist::Iter<>* fl)
-          -> capptr::Alloc<void> {
+          -> void* {
           return alloc->small_refill<zero_mem>(sizeclass, *fl);
         },
         this,
@@ -629,7 +652,7 @@ namespace snmalloc
      * register.
      */
     template<ZeroMem zero_mem>
-    static SNMALLOC_SLOW_PATH capptr::Alloc<void>
+    static SNMALLOC_SLOW_PATH void*
     alloc_not_small(size_t size, Allocator* self)
     {
       if (size == 0)
@@ -641,15 +664,15 @@ namespace snmalloc
       }

       return self->handle_message_queue(
-        [](Allocator* self, size_t size) -> capptr::Alloc<void> {
+        [](Allocator* self, size_t size) -> void* {
           return CheckInit::check_init(
-            [self, size]() {
+            [self, size]() -> void* {
               if (size > bits::one_at_bit(bits::BITS - 1))
               {
                 // Cannot allocate something that is more than half the size of
                 // the address space
                 errno = ENOMEM;
-                return capptr::Alloc<void>{nullptr};
+                return nullptr;
               }

               // Check if secondary allocator wants to offer the memory
+
+              {
                 if constexpr (zero_mem == YesZero)
                   Config::Pal::zero(result, size);
-                return capptr::Alloc<void>::unsafe_from(result);
+                return result;
               }

               // Grab slab of correct size
@@ -694,10 +717,17 @@ namespace snmalloc
                 chunk.unsafe_ptr(), bits::next_pow2(size));
             }

-            return capptr_chunk_is_alloc(
-              capptr_to_user_address_control(chunk));
+            if (chunk.unsafe_ptr() != nullptr)
+            {
+              auto sc = size_to_sizeclass_full(size);
+              self->stats[sc].objects_allocated++;
+              self->stats[sc].slabs_allocated++;
+            }
+
+            return capptr_reveal(
+              capptr_chunk_is_alloc(capptr_to_user_address_control(chunk)));
           },
-          [](Allocator* a, size_t size) {
+          [](Allocator* a, size_t size) -> void* {
             return alloc_not_small<zero_mem>(size, a);
           },
           size);
@@ -707,7 +737,7 @@ namespace snmalloc
     }

     template<ZeroMem zero_mem>
-    SNMALLOC_FAST_PATH capptr::Alloc<void>
+    SNMALLOC_FAST_PATH void*
     small_refill(smallsizeclass_t sizeclass, freelist::Iter<>& fast_free_list)
     {
       void* result = SecondaryAllocator::allocate(
@@ -727,10 +757,8 @@ namespace snmalloc
         // deallocated, before snmalloc is initialised, then it will fail
         // to access the pagemap.
         return CheckInit::check_init(
-          [result]() { return capptr::Alloc<void>::unsafe_from(result); },
-          [](Allocator*, void* result) {
-            return capptr::Alloc<void>::unsafe_from(result);
-          },
+          [result]() { return result; },
+          [](Allocator*, void* result) { return result; },
           result);
       }

@@ -773,6 +801,7 @@ namespace snmalloc
           laden.insert(meta);
         }

+        stats[sizeclass].objects_allocated++;
         auto r = finish_alloc<zero_mem, Config>(p, sizeclass);
         return ticker.check_tick(r);
       }
@@ -780,11 +809,11 @@ namespace snmalloc
     }

     template<ZeroMem zero_mem>
-    SNMALLOC_SLOW_PATH capptr::Alloc<void> small_refill_slow(
+    SNMALLOC_SLOW_PATH void* small_refill_slow(
       smallsizeclass_t sizeclass, freelist::Iter<>& fast_free_list)
     {
       return CheckInit::check_init(
-        [this, sizeclass, &fast_free_list]() -> capptr::Alloc<void> {
+        [this, sizeclass, &fast_free_list]() -> void* {
           size_t rsize = sizeclass_to_size(sizeclass);

           // No existing free list; get a new slab.
@@ -831,6 +860,9 @@ namespace snmalloc
             laden.insert(meta);
           }

+          stats[sizeclass].slabs_allocated++;
+          stats[sizeclass].objects_allocated++;
+
           auto r = finish_alloc<zero_mem, Config>(p, sizeclass);
           return ticker.check_tick(r);
         },
@@ -1006,6 +1038,7 @@ namespace snmalloc
        */
       if (SNMALLOC_LIKELY(public_state() == entry.get_remote()))
       {
+        stats[entry.get_sizeclass()].objects_deallocated++;
         dealloc_cheri_checks(p_tame.unsafe_ptr());
         dealloc_local_object(p_tame, entry);
         return;
@@ -1074,6 +1107,8 @@ namespace snmalloc
       // Remove from set of fully used slabs.
       meta->node.remove();

+      stats[entry.get_sizeclass()].slabs_deallocated++;
+
       Config::Backend::dealloc_chunk(
         get_backend_local_state(), *meta, p, size, entry.get_sizeclass());

@@ -1170,6 +1205,8 @@ namespace snmalloc
       // don't touch the cache lines at this point in snmalloc_check_client.
       auto start = clear_slab(meta, sizeclass);

+      stats[sizeclass].slabs_deallocated++;
+
       Config::Backend::dealloc_chunk(
         get_backend_local_state(),
         *meta,
@@ -1336,10 +1373,10 @@ namespace snmalloc
         return capptr_domesticate(local_state, p);
       };

-      size_t bytes_flushed = 0; // Not currently used.
-
       if (destroy_queue)
       {
+        size_t bytes_flushed = 0;
+
         auto cb = [this, domesticate, &bytes_flushed](
                     capptr::Alloc<RemoteMessage> m) {
           bool need_post = true; // Always going to post, so ignore.
         };

         message_queue().destroy_and_iterate(domesticate, cb);
+
+        RemoteDeallocCache<Config>::remote_inflight -= bytes_flushed;
       }
       else
       {
@@ -1397,8 +1436,9 @@ namespace snmalloc
         }
       });

-      // Set the remote_dealloc_cache to immediately slow path.
-      remote_dealloc_cache.capacity = 0;
+      // TODO: I don't think this is needed.
+      // // Set the remote_dealloc_cache to immediately slow path.
+      // remote_dealloc_cache.cache_bytes = REMOTE_CACHE;

       return posted;
     }
@@ -1467,6 +1507,11 @@ namespace snmalloc
 #endif
       return sent_something;
     }
+
+    const AllocStats& get_stats()
+    {
+      return stats;
+    }
   };

   template
diff --git a/src/snmalloc/mem/allocstats.h b/src/snmalloc/mem/allocstats.h
new file mode 100644
index 000000000..bfa789c36
--- /dev/null
+++ b/src/snmalloc/mem/allocstats.h
@@ -0,0 +1,44 @@
+#pragma once
+
+#include "../ds_core/ds_core.h"
+#include "sizeclasstable.h"
+
+#include <array>
+
+namespace snmalloc
+{
+  struct AllocStat
+  {
+    MonotoneLocalStat objects_allocated{};
+    MonotoneLocalStat objects_deallocated{};
+    MonotoneLocalStat slabs_allocated{};
+    MonotoneLocalStat slabs_deallocated{};
+  };
+
+  class AllocStats
+  {
+    std::array<AllocStat, SIZECLASS_REP_SIZE> sizeclass{};
+
+  public:
+    AllocStat& operator[](sizeclass_t index)
+    {
+      auto i = index.raw();
+      return sizeclass[i];
+    }
+
+    AllocStat& operator[](smallsizeclass_t index)
+    {
+      return sizeclass[sizeclass_t::from_small_class(index).raw()];
+    }
+
+    void operator+=(const AllocStats& other)
+    {
+      for (size_t i = 0; i < SIZECLASS_REP_SIZE; i++)
+      {
+        sizeclass[i].objects_allocated += other.sizeclass[i].objects_allocated;
+        sizeclass[i].objects_deallocated +=
+          other.sizeclass[i].objects_deallocated;
+        sizeclass[i].slabs_allocated += other.sizeclass[i].slabs_allocated;
+        sizeclass[i].slabs_deallocated += other.sizeclass[i].slabs_deallocated;
+      }
+    }
+  };
+} // namespace snmalloc
\ No newline at end of file
diff --git a/src/snmalloc/mem/mem.h b/src/snmalloc/mem/mem.h
index fc5e59965..e9c80765c 100644
--- a/src/snmalloc/mem/mem.h
+++ b/src/snmalloc/mem/mem.h
@@ -1,7 +1,7 @@
+#include "alloc.h"
 #include "backend_concept.h"
 #include "backend_wrappers.h"
 #include "check_init.h"
-#include "corealloc.h"
 #include "entropy.h"
 #include "freelist.h"
 #include "metadata.h"
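
A small sketch of the aggregation semantics of the AllocStats and MonotoneLocalStat types added above (illustrative only, not part of the patch; the size-class index is an arbitrary example):

  #include <snmalloc/snmalloc.h>

  int main()
  {
    snmalloc::AllocStats a, b;
    auto sc = snmalloc::sizeclass_t::from_raw(5); // arbitrary size class

    a[sc].objects_allocated++; // per-allocator counters use relaxed
    a[sc].objects_allocated++; // load + store, not atomic RMW
    b[sc].objects_deallocated++;

    a += b; // aggregation, as get_stats() does over the allocator pool

    // operator* reads the underlying relaxed atomic
    return (*a[sc].objects_allocated == 2 &&
            *a[sc].objects_deallocated == 1) ? 0 : 1;
  }
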
diff --git a/src/snmalloc/mem/pool.h b/src/snmalloc/mem/pool.h
index 9b6294d67..6bce43f06 100644
--- a/src/snmalloc/mem/pool.h
+++ b/src/snmalloc/mem/pool.h
@@ -32,9 +32,15 @@ namespace snmalloc
     FlagWord lock{};
     capptr::Alloc<T> list{nullptr};
+    stl::Atomic<size_t> count{0};

   public:
     constexpr PoolState() = default;
+
+    size_t get_count()
+    {
+      return count.load(stl::memory_order_relaxed);
+    }
   };

   /**
@@ -81,7 +87,7 @@ namespace snmalloc
    * The third template argument is a method to retrieve the actual PoolState.
    *
    * For the pool of allocators, refer to the AllocPool alias defined in
-   * corealloc.h.
+   * alloc.h.
    *
    * For a pool of another type, it is recommended to leave the
    * third template argument with its default value. The SingletonPoolState
@@ -124,6 +130,8 @@ namespace snmalloc
         p->list_next = pool.list;
         pool.list = p;

+        pool.count++;
+
         p->set_in_use();
       });
     return p.unsafe_ptr();
diff --git a/src/snmalloc/mem/remotecache.h b/src/snmalloc/mem/remotecache.h
index 3d5ed70b8..ec60839f4 100644
--- a/src/snmalloc/mem/remotecache.h
+++ b/src/snmalloc/mem/remotecache.h
@@ -194,18 +194,19 @@ namespace snmalloc
     RemoteDeallocCacheBatchingImpl<Config> batching;

+    static inline Stat remote_inflight;
+
     /**
-     * The total amount of memory we are waiting for before we will dispatch
-     * to other allocators. Zero can mean we have not initialised the allocator
-     * yet. This is initialised to the 0 so that we always hit a slow path to
-     * start with, when we hit the slow path and need to dispatch everything, we
-     * can check if we are a real allocator and lazily provide a real allocator.
+     * The total number of bytes of memory in the cache.
+     *
+     * REMOTE_CACHE is used as the initial value so that we always hit a slow
+     * path to start with; when we hit the slow path and need to dispatch
+     * everything, we can check whether we are a real allocator and lazily
+     * provide a real one.
      */
-    int64_t capacity{0};
+    size_t cache_bytes{REMOTE_CACHE};

-#ifndef NDEBUG
     bool initialised = false;
-#endif

     /// Used to find the index into the array of queues for remote
     /// deallocation
@@ -233,13 +234,23 @@ namespace snmalloc
     {
       static_assert(sizeof(n) * 8 > MAX_CAPACITY_BITS);

-      auto size =
-        n * static_cast<int64_t>(sizeclass_full_to_size(entry.get_sizeclass()));
+      size_t size = n * sizeclass_full_to_size(entry.get_sizeclass());
+
+      size_t new_cache_bytes = cache_bytes + size;
+      if (SNMALLOC_UNLIKELY(new_cache_bytes > REMOTE_CACHE))
+      {
+        // Check if this is the default allocator, and if not, we
+        // can update the state.
+        if (initialised)
+        {
+          cache_bytes = new_cache_bytes;
+        }

-      bool result = capacity > size;
-      if (result)
-        capacity -= size;
-      return result;
+        return false;
+      }
+
+      cache_bytes = new_cache_bytes;
+      return true;
     }

     template
@@ -288,6 +299,9 @@ namespace snmalloc
         return capptr_domesticate(local_state, p);
       };

+      // We are about to post cache_bytes bytes to other allocators.
+      remote_inflight += cache_bytes;
+
       batching.close_all([this](
                            RemoteAllocator::alloc_id_t target_id,
                            capptr::Alloc<RemoteMessage> msg) {
@@ -356,8 +370,8 @@ namespace snmalloc
         }
       }

-      // Reset capacity as we have emptied everything
-      capacity = REMOTE_CACHE;
+      // Reset the cache as we have emptied everything
+      cache_bytes = 0;

       return sent_something;
     }
@@ -373,18 +387,16 @@ namespace snmalloc
      */
     void init()
     {
-#ifndef NDEBUG
       initialised = true;
-#endif
+
       for (auto& l : list)
       {
         // We do not need to initialise with a particular slab, so pass
         // a null address.
         l.init(0, RemoteAllocator::key_global, NO_KEY_TWEAK);
       }

-      capacity = REMOTE_CACHE;
-
       batching.init();
+      cache_bytes = 0;
     }
   };
 } // namespace snmalloc
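
To see why cache_bytes starts at REMOTE_CACHE, here is a standalone model of the reserve_space logic above (illustrative only, not part of the patch; the REMOTE_CACHE value is made up):

  #include <cstddef>
  #include <cstdio>

  constexpr size_t REMOTE_CACHE = 1 << 20; // illustrative value

  struct CacheModel
  {
    size_t cache_bytes{REMOTE_CACHE}; // starts full => first call is slow path
    bool initialised{false};

    bool reserve_space(size_t size)
    {
      size_t new_cache_bytes = cache_bytes + size;
      if (new_cache_bytes > REMOTE_CACHE)
      {
        if (initialised) // a real allocator records the overshoot...
          cache_bytes = new_cache_bytes;
        return false;    // ...and the caller posts, then resets to 0
      }
      cache_bytes = new_cache_bytes;
      return true;
    }
  };

  int main()
  {
    CacheModel c;
    printf("%d\n", c.reserve_space(64)); // 0: forced slow path, lazy init runs
    c.initialised = true;
    c.cache_bytes = 0;                   // what init() / post_all() leave behind
    printf("%d\n", c.reserve_space(64)); // 1: fast path from now on
  }
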
diff --git a/src/snmalloc/stl/gnu/atomic.h b/src/snmalloc/stl/gnu/atomic.h
index 7a193972e..bb7145f65 100644
--- a/src/snmalloc/stl/gnu/atomic.h
+++ b/src/snmalloc/stl/gnu/atomic.h
@@ -63,6 +63,11 @@ namespace snmalloc
       return __builtin_addressof(ref);
     }

+    SNMALLOC_FAST_PATH static const T* addressof(const T& ref)
+    {
+      return __builtin_addressof(ref);
+    }
+
     // From libc++:
     // require types that are 1, 2, 4, 8, or 16 bytes in length to be aligned
     // to at least their size to be potentially
@@ -89,7 +94,8 @@ namespace snmalloc
       return load();
     }

-    SNMALLOC_FAST_PATH T load(MemoryOrder mem_ord = MemoryOrder::SEQ_CST)
+    SNMALLOC_FAST_PATH T
+    load(MemoryOrder mem_ord = MemoryOrder::SEQ_CST) const
     {
       T res;
       __atomic_load(addressof(val), addressof(res), order(mem_ord));
diff --git a/src/test/func/alloc_churn/alloc_churn.cc b/src/test/func/alloc_churn/alloc_churn.cc
new file mode 100644
index 000000000..ebfe87774
--- /dev/null
+++ b/src/test/func/alloc_churn/alloc_churn.cc
@@ -0,0 +1,34 @@
+#include "snmalloc/snmalloc.h"
+
+#include <iostream>
+
+void test_step()
+{
+  auto b = snmalloc::get_scoped_allocator();
+  auto a = snmalloc::get_scoped_allocator();
+
+  for (size_t j = 0; j < 32; j++)
+    for (size_t i = 0; i < 20; i++)
+    {
+      auto p = a->alloc(snmalloc::bits::one_at_bit(i));
+      if (p != nullptr)
+        b->dealloc(p);
+      p = b->alloc(snmalloc::bits::one_at_bit(i));
+      if (p != nullptr)
+        a->dealloc(p);
+    }
+}
+
+int main()
+{
+  for (size_t i = 0; i < 1000; i++)
+  {
+    if (i % 100 == 0)
+    {
+      std::cout << "Step " << i << std::endl;
+      snmalloc::print_alloc_stats();
+      snmalloc::debug_check_empty();
+    }
+    test_step();
+  }
+}
\ No newline at end of file
diff --git a/src/test/func/cleanup/cleanup.cc b/src/test/func/cleanup/cleanup.cc
new file mode 100644
index 000000000..5e3666dc8
--- /dev/null
+++ b/src/test/func/cleanup/cleanup.cc
@@ -0,0 +1,61 @@
+#include <cstring>
+#include <iostream>
+#include <snmalloc/snmalloc.h>
+#include <thread>
+#include <vector>
+
+void ecall()
+{
+  auto a = snmalloc::get_scoped_allocator();
+  std::vector<void*> allocs;
+  for (size_t j = 0; j < 1000; j++)
+  {
+    allocs.push_back(a->alloc(j % 1024));
+  }
+  auto p = a->alloc(1 * 1024 * 1024);
+  memset(p, 0, 1 * 1024 * 1024);
+
+  for (size_t j = 0; j < allocs.size(); j++)
+    a->dealloc(allocs[j]);
+
+  a->dealloc(p);
+}
+
+void thread_body()
+{
+  for (int i = 0; i < 1000; i++)
+  {
+    ecall();
+    std::this_thread::sleep_for(std::chrono::milliseconds(10));
+  }
+}
+
+void monitor_body()
+{
+  for (int i = 0; i < 60; i++)
+  {
+    std::cout << "Current: "
+              << snmalloc::Alloc::Config::Backend::get_current_usage()
+              << std::endl;
+    std::cout << "Peak   : "
+              << snmalloc::Alloc::Config::Backend::get_peak_usage()
+              << std::endl;
+    std::cout << "Allocs : " << snmalloc::Alloc::Config::pool().get_count()
+              << std::endl;
+    std::cout << "--------------------------------------------" << std::endl;
+    std::this_thread::sleep_for(std::chrono::seconds(1));
+  }
+}
+
+int main()
+{
+  std::vector<std::thread> threads;
+  for (int i = 0; i < 8; i++)
+  {
+    threads.push_back(std::thread(thread_body));
+  }
+  threads.push_back(std::thread(monitor_body));
+
+  for (auto& t : threads)
+    t.join();
+  return 0;
+}
\ No newline at end of file
diff --git a/src/test/func/memory/memory.cc b/src/test/func/memory/memory.cc
index 891737843..9f876d459 100644
--- a/src/test/func/memory/memory.cc
+++ b/src/test/func/memory/memory.cc
@@ -558,7 +558,7 @@ int main(int argc, char** argv)
 #endif
 #define TEST(testname) \
   std::cout << "Running " #testname << std::endl; \
-  for (size_t i = 0; i < 100; i++) \
+  for (size_t i = 0; i < 50; i++) \
   testname();

 TEST(test_alloc_dealloc_64k);
diff --git a/src/test/func/statistics/stats.cc b/src/test/func/statistics/stats.cc
index d66f060a1..3bedcd55b 100644
--- a/src/test/func/statistics/stats.cc
+++ b/src/test/func/statistics/stats.cc
@@ -11,6 +11,7 @@ void debug_check_empty_1()
   auto r = snmalloc::alloc(size);

   snmalloc::debug_check_empty(&result);
+  snmalloc::print_alloc_stats();
   if (result != false)
   {
     std::cout << "debug_check_empty failed to detect leaked memory:" << size
@@ -18,8 +19,12 @@ void debug_check_empty_1()
     abort();
   }

+  snmalloc::print_alloc_stats();
+
   snmalloc::dealloc(r);

+  snmalloc::print_alloc_stats();
+
   snmalloc::debug_check_empty(&result);
   if (result != true)
   {
@@ -27,8 +32,12 @@ void debug_check_empty_1()
     abort();
   }

+  snmalloc::print_alloc_stats();
+
   r = snmalloc::alloc(size);

+  snmalloc::print_alloc_stats();
+
   snmalloc::debug_check_empty(&result);
   if (result != false)
   {
@@ -37,14 +46,20 @@ void debug_check_empty_1()
     abort();
   }

+  snmalloc::print_alloc_stats();
+
   snmalloc::dealloc(r);

+  snmalloc::print_alloc_stats();
+
   snmalloc::debug_check_empty(&result);
   if (result != true)
   {
     std::cout << "debug_check_empty failed to say empty:" << size << std::endl;
     abort();
   }
+
+  snmalloc::print_alloc_stats();
 }

 template
diff --git a/src/test/perf/batchblitz/batchblitz.cc b/src/test/perf/batchblitz/batchblitz.cc
new file mode 100644
index 000000000..3dce75353
--- /dev/null
+++ b/src/test/perf/batchblitz/batchblitz.cc
@@ -0,0 +1,92 @@
+#include <atomic>
+#include <cstdio>
+#include <snmalloc/snmalloc.h>
+#include <thread>
+#include <vector>
+
+size_t threads{0};
+size_t memory{0};
+size_t iterations{0};
+
+// Global barrier for synchronising threads.
+std::atomic<size_t> barrier{0};
+std::atomic<size_t> incarnation{0};
+
+std::atomic<bool> stop{false};
+
+std::vector<std::vector<void*>> allocations;
+
+NOINLINE bool wait()
+{
+  auto old_incarnation = incarnation.load();
+  // Register we have arrived at the barrier.
+  if (--barrier == 0)
+  {
+    printf(".");
+    fflush(stdout);
+    barrier = threads;
+    incarnation++;
+    return stop;
+  }
+
+  while (incarnation.load() == old_incarnation)
+  {
+    if (stop)
+      return true;
+    snmalloc::Aal::pause();
+  }
+
+  return stop;
+}
+
+void thread_func(size_t tid)
+{
+  size_t size = 4097;
+  size_t mem = memory / size;
+  for (size_t j = 0; j < iterations; j++)
+  {
+    if (wait())
+      return;
+    std::vector<void*>& allocs = allocations[tid];
+    for (size_t i = 0; i < mem; i++)
+    {
+      allocs.push_back(snmalloc::alloc(4097));
+    }
+    if (wait())
+      return;
+    std::vector<void*>& deallocs = allocations[(tid + 1) % threads];
+    for (auto p : deallocs)
+    {
+      snmalloc::dealloc(p);
+    }
+    deallocs.clear();
+  }
+}
+
+int main()
+{
+  threads = std::thread::hardware_concurrency();
+  barrier = threads;
+
+  if (snmalloc::DefaultPal::address_bits == 32)
+    memory = snmalloc::bits::one_at_bit(30) / threads;
+  else
+    memory = snmalloc::bits::one_at_bit(32) / threads;
+  iterations = 1000;
+
+  for (size_t i = 0; i < threads; i++)
+    allocations.emplace_back();
+
+  std::vector<std::thread> thread_pool;
+  for (size_t i = 0; i < threads; i++)
+    thread_pool.emplace_back(thread_func, i);
+
+  for (size_t i = 0; i < 30; i++)
+  {
+    std::this_thread::sleep_for(std::chrono::seconds(1));
+    snmalloc::print_alloc_stats();
+  }
+  stop = true;
+
+  for (auto& t : thread_pool)
+    t.join();
+}
diff --git a/src/test/perf/churn/churn.cc b/src/test/perf/churn/churn.cc
new file mode 100644
index 000000000..910204a1d
--- /dev/null
+++ b/src/test/perf/churn/churn.cc
@@ -0,0 +1,97 @@
+#include <atomic>
+#include <cstdlib>
+#include <iostream>
+#include <queue>
+#include <snmalloc/snmalloc.h>
+#include <thread>
+#include <vector>
+
+int main()
+{
+  std::vector<std::thread> threads;
+  std::atomic<size_t> running;
+  snmalloc::Stat requests;
+  std::atomic<bool> done{false};
+
+  for (size_t i = 0; i < 16; i++)
+  {
+    threads.push_back(std::thread([&running, &requests, &done]() {
+      std::queue<size_t*> q;
+      while (!done)
+      {
+        snmalloc::ScopedAllocator alloc;
+        running++;
+
+        if (rand() % 1000 == 0)
+        {
+          // Deallocate everything in the queue
+          while (q.size() > 0)
+          {
+            auto p = q.front();
+            requests -= *p;
+            alloc->dealloc(p);
+            q.pop();
+          }
+        }
+
+        for (size_t j = 0; j < 1000; j++)
+        {
+          if (q.size() >= 20000 || (q.size() > 0 && (rand() % 10 == 0)))
+          {
+            auto p = q.front();
+            requests -= *p;
+            alloc->dealloc(p);
+            q.pop();
+          }
+          else
+          {
+            size_t size =
+              (rand() % 1024 == 0) ? 16 * 1024 * (1 << (rand() % 3)) : 48;
+            requests += size;
+            auto p = (size_t*)alloc->alloc(size);
+            *p = size;
+            q.push(p);
+          }
+        }
+
+        running--;
+        std::this_thread::sleep_for(std::chrono::microseconds(rand() % 2000));
+      }
+    }));
+  }
+
+  std::thread([&requests]() {
+    size_t count = 0;
+    while (count < 60)
+    {
+      count++;
+      std::this_thread::sleep_for(std::chrono::seconds(1));
+      // std::cout << "Inflight: "
+      //   << snmalloc::RemoteDeallocCache<snmalloc::Alloc::Config>::remote_inflight
+      //   << std::endl;
+      // std::cout << "Current reservation: "
+      //   << snmalloc::Globals::get_current_usage() << std::endl;
+      // std::cout << "Peak reservation: "
+      //   << snmalloc::Globals::get_peak_usage() << std::endl;
+      // std::cout << "Allocator count: "
+      //   << snmalloc::Globals::pool().get_count() << std::endl;
+      // std::cout << "Running threads: " << running << std::endl;
+      // std::cout << "Index: " << count << std::endl;
+      // std::cout << "------------------------------------------" << std::endl;
+      std::cout
+        << count << "," << snmalloc::Alloc::Config::Backend::get_peak_usage()
+        << "," << snmalloc::Alloc::Config::Backend::get_current_usage() << ","
+        << requests.get_curr() << "," << requests.get_peak() << ","
+        << snmalloc::RemoteDeallocCache<snmalloc::Alloc::Config>::remote_inflight
+             .get_peak()
+        << ","
+        << snmalloc::RemoteDeallocCache<snmalloc::Alloc::Config>::remote_inflight
+             .get_curr()
+        << std::endl;
+      snmalloc::print_alloc_stats();
+    }
+  }).join();
+
+  done = true;
+
+  for (auto& t : threads)
+    t.join();
+
+  return 0;
+}
\ No newline at end of file
diff --git a/src/test/perf/combininglock/combininglock.cc b/src/test/perf/combininglock/combininglock.cc
new file mode 100644
index 000000000..6a9437c70
--- /dev/null
+++ b/src/test/perf/combininglock/combininglock.cc
@@ -0,0 +1,37 @@
+#include <atomic>
+#include <snmalloc/snmalloc.h>
+#include <thread>
+#include <vector>
+
+snmalloc::CombiningLock cl;
+
+std::atomic<bool> run{true};
+
+void loop()
+{
+  size_t j = 0;
+  size_t i = 0;
+  while (run)
+  {
+    i++;
+    snmalloc::with(cl, [&]() { j++; });
+    if (i != j)
+      snmalloc::error("i != j");
+  }
+}
+
+int main()
+{
+  std::vector<std::thread> threads;
+  for (size_t i = 0; i < 8; i++)
+  {
+    threads.emplace_back(std::thread(loop));
+  }
+
+  std::this_thread::sleep_for(std::chrono::seconds(100));
+  run = false;
+
+  for (auto& t : threads)
+  {
+    t.join();
+  }
+}
\ No newline at end of file
diff --git a/src/test/perf/realloc/realloc.cc b/src/test/perf/realloc/realloc.cc
new file mode 100644
index 000000000..5efcfbaeb
--- /dev/null
+++ b/src/test/perf/realloc/realloc.cc
@@ -0,0 +1,46 @@
+#include "test/opt.h"
+#include "test/setup.h"
+#include "test/usage.h"
+#include "test/xoroshiro.h"
+
+#include <iostream>
+#include <snmalloc/snmalloc.h>
+
+using namespace snmalloc;
+
+NOINLINE
+void* myrealloc(void* p, size_t size)
+{
+  return snmalloc::libc::realloc(p, size);
+}
+
+void grow()
+{
+  void* base = nullptr;
+  for (size_t i = 1; i < 1000; i++)
+  {
+    base = myrealloc(base, i * 8);
+  }
+  snmalloc::libc::free(base);
+}
+
+int main()
+{
+  auto start = Aal::tick();
+
+  for (size_t i = 0; i < 10000; i++)
+  {
+    grow();
+    if (i % 10 == 0)
+    {
+      std::cout << "." << std::flush;
+    }
+  }
+
+  auto end = Aal::tick();
+
+  std::cout << "Taken: " << end - start << std::endl;
+}
\ No newline at end of file
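
Finally, a minimal sketch of driving the new statistics entry points from application code, following the pattern the tests above use (illustrative only, not part of the patch; output goes through snmalloc's message mechanism):

  #include <snmalloc/snmalloc.h>

  int main()
  {
    // Do a little work so the counters are non-trivial.
    auto p = snmalloc::alloc(128);
    snmalloc::dealloc(p);

    // Emits one "snmalloc_allocs" CSV row per active size class
    // (dumpid, sizeclass, size, allocated, deallocated, in_use, bytes,
    // slab counters) and one "snmalloc_totals" row (backend bytes, peak
    // backend bytes, requested, slab bytes, remote inflight, allocator count).
    snmalloc::print_alloc_stats();
  }
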