From cd61cfaa32d5da2d3d5cf6aa548b7cf50c828b90 Mon Sep 17 00:00:00 2001
From: Jonas Hahnfeld <jonas.hahnfeld@cern.ch>
Date: Tue, 24 Mar 2026 10:39:30 +0100
Subject: [PATCH 1/8] [hist] Restrict atomic functions to arithmetic types

---
 hist/histv7/inc/ROOT/RHistUtils.hxx | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/hist/histv7/inc/ROOT/RHistUtils.hxx b/hist/histv7/inc/ROOT/RHistUtils.hxx
index 452819ab4739b..8c66495b19df6 100644
--- a/hist/histv7/inc/ROOT/RHistUtils.hxx
+++ b/hist/histv7/inc/ROOT/RHistUtils.hxx
@@ -142,7 +142,7 @@ struct AtomicOps<8> {
 #endif
 
 template <typename T>
-void AtomicLoad(const T *ptr, T *ret)
+std::enable_if_t<std::is_arithmetic_v<T>> AtomicLoad(const T *ptr, T *ret)
 {
 #ifndef _MSC_VER
    __atomic_load(ptr, ret, __ATOMIC_RELAXED);
@@ -152,7 +152,7 @@ void AtomicLoad(const T *ptr, T *ret)
 }
 
 template <typename T>
-void AtomicStoreRelease(T *ptr, T *val)
+std::enable_if_t<std::is_arithmetic_v<T>> AtomicStoreRelease(T *ptr, T *val)
 {
 #ifndef _MSC_VER
    __atomic_store(ptr, val, __ATOMIC_RELEASE);
@@ -164,7 +164,7 @@ void AtomicStoreRelease(T *ptr, T *val)
 }
 
 template <typename T>
-bool AtomicCompareExchange(T *ptr, T *expected, T *desired)
+std::enable_if_t<std::is_arithmetic_v<T>, bool> AtomicCompareExchange(T *ptr, T *expected, T *desired)
 {
 #ifndef _MSC_VER
    return __atomic_compare_exchange(ptr, expected, desired, /*weak=*/false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
@@ -174,7 +174,7 @@ bool AtomicCompareExchange(T *ptr, T *expected, T *desired)
 }
 
 template <typename T>
-bool AtomicCompareExchangeAcquire(T *ptr, T *expected, T *desired)
+std::enable_if_t<std::is_arithmetic_v<T>, bool> AtomicCompareExchangeAcquire(T *ptr, T *expected, T *desired)
 {
 #ifndef _MSC_VER
    return __atomic_compare_exchange(ptr, expected, desired, /*weak=*/false, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
@@ -187,7 +187,7 @@ bool AtomicCompareExchangeAcquire(T *ptr, T *expected, T *desired)
 }
 
 template <typename T>
-void AtomicAddCompareExchangeLoop(T *ptr, T val)
+std::enable_if_t<std::is_arithmetic_v<T>> AtomicAddCompareExchangeLoop(T *ptr, T val)
 {
    T expected;
    AtomicLoad(ptr, &expected);

From 811e61562fc4d3f2750809322584f7f939d0afd3 Mon Sep 17 00:00:00 2001
From: Jonas Hahnfeld <jonas.hahnfeld@cern.ch>
Date: Tue, 24 Mar 2026 13:07:05 +0100
Subject: [PATCH 2/8] [hist] Add more microbenchmarks for atomic operations

---
 .../benchmark/hist_benchmark_atomic.cxx       | 60 +++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/hist/histv7/benchmark/hist_benchmark_atomic.cxx b/hist/histv7/benchmark/hist_benchmark_atomic.cxx
index e18fa0016630e..f6d5b67027ac9 100644
--- a/hist/histv7/benchmark/hist_benchmark_atomic.cxx
+++ b/hist/histv7/benchmark/hist_benchmark_atomic.cxx
@@ -38,6 +38,26 @@ BENCHMARK_DEFINE_F(RHistAtomic_int, AtomicAdd)(benchmark::State &state)
 }
 BENCHMARK_REGISTER_F(RHistAtomic_int, AtomicAdd);
 
+BENCHMARK_DEFINE_F(RHistAtomic_int, AtomicLoad)(benchmark::State &state)
+{
+   int load;
+   for (auto _ : state) {
+      ROOT::Experimental::Internal::AtomicLoad(&fAtomic, &load);
+      benchmark::ClobberMemory();
+   }
+}
+BENCHMARK_REGISTER_F(RHistAtomic_int, AtomicLoad);
+
+BENCHMARK_DEFINE_F(RHistAtomic_int, AtomicStoreRelease)(benchmark::State &state)
+{
+   int store = 1;
+   for (auto _ : state) {
+      ROOT::Experimental::Internal::AtomicStoreRelease(&fAtomic, &store);
+      benchmark::ClobberMemory();
+   }
+}
+BENCHMARK_REGISTER_F(RHistAtomic_int, AtomicStoreRelease);
+
 struct RHistAtomic_float : public benchmark::Fixture {
    float fAtomic = 0;
 };
@@ -60,6 +80,26 @@ BENCHMARK_DEFINE_F(RHistAtomic_float, AtomicAdd)(benchmark::State &state)
 }
 BENCHMARK_REGISTER_F(RHistAtomic_float, AtomicAdd);
 
+BENCHMARK_DEFINE_F(RHistAtomic_float, AtomicLoad)(benchmark::State &state)
+{
+   float load;
+   for (auto _ : state) {
+      ROOT::Experimental::Internal::AtomicLoad(&fAtomic, &load);
+      benchmark::ClobberMemory();
+   }
+}
+BENCHMARK_REGISTER_F(RHistAtomic_float, AtomicLoad);
+
+BENCHMARK_DEFINE_F(RHistAtomic_float, AtomicStoreRelease)(benchmark::State &state)
+{
+   float store = 1.0f;
+   for (auto _ : state) {
+      ROOT::Experimental::Internal::AtomicStoreRelease(&fAtomic, &store);
+      benchmark::ClobberMemory();
+   }
+}
+BENCHMARK_REGISTER_F(RHistAtomic_float, AtomicStoreRelease);
+
 struct RHistAtomic_double : public benchmark::Fixture {
    double fAtomic = 0;
 };
@@ -82,6 +122,26 @@ BENCHMARK_DEFINE_F(RHistAtomic_double, AtomicAdd)(benchmark::State &state)
 }
 BENCHMARK_REGISTER_F(RHistAtomic_double, AtomicAdd);
 
+BENCHMARK_DEFINE_F(RHistAtomic_double, AtomicLoad)(benchmark::State &state)
+{
+   double load;
+   for (auto _ : state) {
+      ROOT::Experimental::Internal::AtomicLoad(&fAtomic, &load);
+      benchmark::ClobberMemory();
+   }
+}
+BENCHMARK_REGISTER_F(RHistAtomic_double, AtomicLoad);
+
+BENCHMARK_DEFINE_F(RHistAtomic_double, AtomicStoreRelease)(benchmark::State &state)
+{
+   double store = 1.0;
+   for (auto _ : state) {
+      ROOT::Experimental::Internal::AtomicStoreRelease(&fAtomic, &store);
+      benchmark::ClobberMemory();
+   }
+}
+BENCHMARK_REGISTER_F(RHistAtomic_double, AtomicStoreRelease);
+
 struct RBinWithError : public benchmark::Fixture {
    ROOT::Experimental::RBinWithError fBin;
 };

From aa8814d158aeb86a231b2296664d4199b26a1101 Mon Sep 17 00:00:00 2001
From: Jonas Hahnfeld <jonas.hahnfeld@cern.ch>
Date: Mon, 23 Mar 2026 11:38:17 +0100
Subject: [PATCH 3/8] [hist] Explain release-acquire ordering in RBinWithError

---
 hist/histv7/inc/ROOT/RBinWithError.hxx | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/hist/histv7/inc/ROOT/RBinWithError.hxx b/hist/histv7/inc/ROOT/RBinWithError.hxx
index 44ccd2652348d..2de3ce30fda7b 100644
--- a/hist/histv7/inc/ROOT/RBinWithError.hxx
+++ b/hist/histv7/inc/ROOT/RBinWithError.hxx
@@ -74,7 +74,9 @@ private:
             Internal::AtomicLoad(&fSum2, &origSum2);
          }
 
-         // The variable appears to be unlocked, confirm with a compare-exchange.
+         // The variable appears to be unlocked, confirm with a compare-exchange. It uses acquire memory order in case
+         // of success, so the release store synchronizes with this load. This ensures that the previous write of fSum
+         // becomes a visible side-effect in this thread and the access below will see it.
          double negated = std::copysign(origSum2, -1.0);
          if (Internal::AtomicCompareExchangeAcquire(&fSum2, &origSum2, &negated)) {
             break;
@@ -85,6 +87,7 @@ private:
       fSum += a;
 
       double sum2 = origSum2 + a2;
+      // This store synchronizes with the compare-exchange, see above.
       Internal::AtomicStoreRelease(&fSum2, &sum2);
    }
 

From 2e5d8065e66ea8ddaca9ac693b8197c022a63a27 Mon Sep 17 00:00:00 2001
From: Jonas Hahnfeld <jonas.hahnfeld@cern.ch>
Date: Fri, 20 Mar 2026 13:48:15 +0100
Subject: [PATCH 4/8] [hist] Implement RBinWithError::AtomicLoad

After the changes introduced in commit 43566dc589 ("Ensure consistency
of RBinWithError::AtomicAdd"), it is possible to detect
a. ongoing updates if fSum2 is negative, and
b. inconsistent reads if the value of fSum2 changes.
---
 .../benchmark/hist_benchmark_atomic.cxx       | 10 ++++
 hist/histv7/inc/ROOT/RBinWithError.hxx        | 27 +++++++++++
 hist/histv7/test/hist_atomic.cxx              | 46 +++++++++++++++++++
 3 files changed, 83 insertions(+)

diff --git a/hist/histv7/benchmark/hist_benchmark_atomic.cxx b/hist/histv7/benchmark/hist_benchmark_atomic.cxx
index f6d5b67027ac9..a31044297b45c 100644
--- a/hist/histv7/benchmark/hist_benchmark_atomic.cxx
+++ b/hist/histv7/benchmark/hist_benchmark_atomic.cxx
@@ -182,4 +182,14 @@ BENCHMARK_DEFINE_F(RBinWithError, AtomicAdd)(benchmark::State &state)
 }
 BENCHMARK_REGISTER_F(RBinWithError, AtomicAdd);
 
+BENCHMARK_DEFINE_F(RBinWithError, AtomicLoad)(benchmark::State &state)
+{
+   ROOT::Experimental::RBinWithError load;
+   for (auto _ : state) {
+      fBin.AtomicLoad(&load);
+      benchmark::DoNotOptimize(load);
+   }
+}
+BENCHMARK_REGISTER_F(RBinWithError, AtomicLoad);
+
 BENCHMARK_MAIN();
diff --git a/hist/histv7/inc/ROOT/RBinWithError.hxx b/hist/histv7/inc/ROOT/RBinWithError.hxx
index 2de3ce30fda7b..82214a84c2575 100644
--- a/hist/histv7/inc/ROOT/RBinWithError.hxx
+++ b/hist/histv7/inc/ROOT/RBinWithError.hxx
@@ -8,6 +8,7 @@
 #include "RHistUtils.hxx"
 
 #include <cmath>
+#include <cstring>
 
 namespace ROOT {
 namespace Experimental {
@@ -100,6 +101,32 @@ public:
    ///
    /// \param[in] rhs another bin content that must not be modified during the operation
    void AtomicAdd(const RBinWithError &rhs) { AtomicAdd(rhs.fSum, rhs.fSum2); }
+
+   void AtomicLoad(RBinWithError *ret) const
+   {
+      double origSum2;
+      Internal::AtomicLoad(&fSum2, &origSum2);
+
+      while (true) {
+         // Repeat loads from memory until we see a non-negative value.
+         // NB: do not use origSum2 < 0, it does not work for -0.0!
+         while (std::signbit(origSum2)) {
+            Internal::AtomicLoad(&fSum2, &origSum2);
+         }
+
+         Internal::AtomicLoad(&fSum, &ret->fSum);
+         Internal::AtomicLoad(&fSum2, &ret->fSum2);
+
+         // Check if the value of fSum2 is still identical to the first load.
+         // NB: do not use origSum2 == ret->fSum2, it is problematic because 0.0 == -0.0!
+         if (!std::memcmp(&origSum2, &ret->fSum2, sizeof(origSum2))) {
+            return;
+         }
+
+         // The comparison failed, an update happened or is in progress.
+         origSum2 = ret->fSum2;
+      }
+   }
 };
 
 } // namespace Experimental
diff --git a/hist/histv7/test/hist_atomic.cxx b/hist/histv7/test/hist_atomic.cxx
index b31c55c19a0d9..26a544d405bba 100644
--- a/hist/histv7/test/hist_atomic.cxx
+++ b/hist/histv7/test/hist_atomic.cxx
@@ -1,5 +1,6 @@
 #include "hist_test.hxx"
 
+#include <atomic>
 #include <cstddef>
 
 #ifndef TYPED_TEST_SUITE
@@ -81,3 +82,48 @@ TEST(RBinWithError, StressAtomicAdd)
    EXPECT_EQ(bin.fSum, NAdds * Addend);
    EXPECT_EQ(bin.fSum2, NAdds * Addend * Addend);
 }
+
+TEST(RBinWithError, AtomicLoad)
+{
+   RBinWithError bin;
+   bin.fSum = 1;
+   bin.fSum2 = 2;
+
+   RBinWithError load;
+   bin.AtomicLoad(&load);
+   EXPECT_EQ(load.fSum, 1);
+   EXPECT_EQ(load.fSum2, 2);
+}
+
+TEST(RBinWithError, StressAtomicLoad)
+{
+   static constexpr double Addend = 2.0;
+   static constexpr std::size_t NThreads = 4;
+   static constexpr std::size_t NAddsPerThread = 8000;
+   static constexpr std::size_t NLoads = 8000;
+
+   RBinWithError bin;
+   double sum = 0, sum2 = 0;
+
+   std::atomic_flag loader;
+   StressInParallel(NThreads, [&] {
+      if (!loader.test_and_set()) {
+         RBinWithError load;
+         for (std::size_t i = 0; i < NLoads; i++) {
+            bin.AtomicLoad(&load);
+            if (load.fSum * Addend != load.fSum2) {
+               // Failure! Store the values and stop the loader.
+               sum = load.fSum;
+               sum2 = load.fSum2;
+               return;
+            }
+         }
+      } else {
+         for (std::size_t i = 0; i < NAddsPerThread; i++) {
+            bin.AtomicAdd(Addend);
+         }
+      }
+   });
+
+   EXPECT_EQ(sum * Addend, sum2);
+}

From aa78ec9cf182a85d1e3120cbf5af1aaaaf25e01d Mon Sep 17 00:00:00 2001
From: Jonas Hahnfeld <jonas.hahnfeld@cern.ch>
Date: Tue, 24 Mar 2026 09:07:27 +0100
Subject: [PATCH 5/8] [hist] Use LoadAcquire in RBinWithError::AtomicLoad

This ensures correctness on weakly-ordered systems. This also requires
a store release of fSum in AtomicAdd to make sure a previous write of
fSum2 is visible in AtomicLoad.
---
 .../benchmark/hist_benchmark_atomic.cxx       | 30 +++++++++++++++++++
 hist/histv7/inc/ROOT/RBinWithError.hxx        | 25 ++++++++++++----
 hist/histv7/inc/ROOT/RHistUtils.hxx           | 12 ++++++++
 3 files changed, 61 insertions(+), 6 deletions(-)

diff --git a/hist/histv7/benchmark/hist_benchmark_atomic.cxx b/hist/histv7/benchmark/hist_benchmark_atomic.cxx
index a31044297b45c..02fe03a14eb43 100644
--- a/hist/histv7/benchmark/hist_benchmark_atomic.cxx
+++ b/hist/histv7/benchmark/hist_benchmark_atomic.cxx
@@ -48,6 +48,16 @@ BENCHMARK_DEFINE_F(RHistAtomic_int, AtomicLoad)(benchmark::State &state)
 }
 BENCHMARK_REGISTER_F(RHistAtomic_int, AtomicLoad);
 
+BENCHMARK_DEFINE_F(RHistAtomic_int, AtomicLoadAcquire)(benchmark::State &state)
+{
+   int load;
+   for (auto _ : state) {
+      ROOT::Experimental::Internal::AtomicLoadAcquire(&fAtomic, &load);
+      benchmark::ClobberMemory();
+   }
+}
+BENCHMARK_REGISTER_F(RHistAtomic_int, AtomicLoadAcquire);
+
 BENCHMARK_DEFINE_F(RHistAtomic_int, AtomicStoreRelease)(benchmark::State &state)
 {
    int store = 1;
@@ -90,6 +100,16 @@ BENCHMARK_DEFINE_F(RHistAtomic_float, AtomicLoad)(benchmark::State &state)
 }
 BENCHMARK_REGISTER_F(RHistAtomic_float, AtomicLoad);
 
+BENCHMARK_DEFINE_F(RHistAtomic_float, AtomicLoadAcquire)(benchmark::State &state)
+{
+   float load;
+   for (auto _ : state) {
+      ROOT::Experimental::Internal::AtomicLoadAcquire(&fAtomic, &load);
+      benchmark::ClobberMemory();
+   }
+}
+BENCHMARK_REGISTER_F(RHistAtomic_float, AtomicLoadAcquire);
+
 BENCHMARK_DEFINE_F(RHistAtomic_float, AtomicStoreRelease)(benchmark::State &state)
 {
    float store = 1.0f;
@@ -132,6 +152,16 @@ BENCHMARK_DEFINE_F(RHistAtomic_double, AtomicLoad)(benchmark::State &state)
 }
 BENCHMARK_REGISTER_F(RHistAtomic_double, AtomicLoad);
 
+BENCHMARK_DEFINE_F(RHistAtomic_double, AtomicLoadAcquire)(benchmark::State &state)
+{
+   double load;
+   for (auto _ : state) {
+      ROOT::Experimental::Internal::AtomicLoadAcquire(&fAtomic, &load);
+      benchmark::ClobberMemory();
+   }
+}
+BENCHMARK_REGISTER_F(RHistAtomic_double, AtomicLoadAcquire);
+
 BENCHMARK_DEFINE_F(RHistAtomic_double, AtomicStoreRelease)(benchmark::State &state)
 {
    double store = 1.0;
diff --git a/hist/histv7/inc/ROOT/RBinWithError.hxx b/hist/histv7/inc/ROOT/RBinWithError.hxx
index 82214a84c2575..5127803401d6b 100644
--- a/hist/histv7/inc/ROOT/RBinWithError.hxx
+++ b/hist/histv7/inc/ROOT/RBinWithError.hxx
@@ -84,8 +84,12 @@ private:
          }
       }
 
-      // By using a spin lock, we do not need atomic operations for fSum.
-      fSum += a;
+      // By using a spin lock, we would not need atomic operations for fSum. However, we must use a release store to
+      // guarantee correctness of AtomicLoad.
+      double sum;
+      Internal::AtomicLoad(&fSum, &sum);
+      sum += a;
+      Internal::AtomicStoreRelease(&fSum, &sum);
 
       double sum2 = origSum2 + a2;
       // This store synchronizes with the compare-exchange, see above.
@@ -105,17 +109,26 @@ public:
    void AtomicLoad(RBinWithError *ret) const
    {
       double origSum2;
-      Internal::AtomicLoad(&fSum2, &origSum2);
+      Internal::AtomicLoadAcquire(&fSum2, &origSum2);
 
       while (true) {
          // Repeat loads from memory until we see a non-negative value.
          // NB: do not use origSum2 < 0, it does not work for -0.0!
          while (std::signbit(origSum2)) {
-            Internal::AtomicLoad(&fSum2, &origSum2);
+            Internal::AtomicLoadAcquire(&fSum2, &origSum2);
          }
 
-         Internal::AtomicLoad(&fSum, &ret->fSum);
-         Internal::AtomicLoad(&fSum2, &ret->fSum2);
+         // The release store to fSum2 at the end of AtomicAdd synchronizes with the last acquire load above. This
+         // ensures that the previous write of fSum becomes a visible side-effect in this thread and the atomic load
+         // below will see it.
+
+         Internal::AtomicLoadAcquire(&fSum, &ret->fSum);
+
+         // In the reverse direction, the release store of fSum synchronizes with the acquire load above. This ensures
+         // that the previous write of fSum2 becomes a visible side-effect and we would notice if the spinlock is taken.
+
+         // NB: an acquire load is required here in case the comparison fails below and we repeat the outer loop.
+         Internal::AtomicLoadAcquire(&fSum2, &ret->fSum2);
 
          // Check if the value of fSum2 is still identical to the first load.
          // NB: do not use origSum2 == ret->fSum2, it is problematic because 0.0 == -0.0!
diff --git a/hist/histv7/inc/ROOT/RHistUtils.hxx b/hist/histv7/inc/ROOT/RHistUtils.hxx
index 8c66495b19df6..0240b0d056f26 100644
--- a/hist/histv7/inc/ROOT/RHistUtils.hxx
+++ b/hist/histv7/inc/ROOT/RHistUtils.hxx
@@ -151,6 +151,18 @@ std::enable_if_t<std::is_arithmetic_v<T>> AtomicLoad(const T *ptr, T *ret)
 #endif
 }
 
+template <typename T>
+std::enable_if_t<std::is_arithmetic_v<T>> AtomicLoadAcquire(const T *ptr, T *ret)
+{
+#ifndef _MSC_VER
+   __atomic_load(ptr, ret, __ATOMIC_ACQUIRE);
+#else
+   MSVC::AtomicOps<sizeof(T)>::Load(ptr, ret);
+   // Cannot specify the memory order directly, use a fence.
+   std::atomic_thread_fence(std::memory_order_acquire);
+#endif
+}
+
 template <typename T>
 std::enable_if_t<std::is_arithmetic_v<T>> AtomicStoreRelease(T *ptr, T *val)
 {

From a0570ecca6df4f6d148b7167112c9179374f7f3f Mon Sep 17 00:00:00 2001
From: Jonas Hahnfeld <jonas.hahnfeld@cern.ch>
Date: Fri, 20 Mar 2026 15:38:26 +0100
Subject: [PATCH 6/8] [hist] Implement RHistEngine::SnapshotAtomic

It returns a consistent copy of the histogram, during concurrent
filling. The implementation is based on a successful double collect
as described by Afek et al in their 1993 paper "Atomic Snapshots of
Shared Memory." The idea is to iterate twice over all bins until no
change is observed.

This is guaranteed to work for unweighted filling because bin contents
are monotonically increasing. With weighted filling and potentially
negative weights, there could be the situation where a bin goes back
to its previous content and a change is missed. This is probably
unlikely in practice, or can be entirely avoided by using RBinWithError
that also tracks the sum of squares, which is monotonically increasing
even for negative weights.
---
 hist/histv7/inc/ROOT/RHistEngine.hxx    | 38 ++++++++++++++++
 hist/histv7/test/hist_engine_atomic.cxx | 58 +++++++++++++++++++++++++
 2 files changed, 96 insertions(+)

diff --git a/hist/histv7/inc/ROOT/RHistEngine.hxx b/hist/histv7/inc/ROOT/RHistEngine.hxx
index f438e60b2c789..9e8ce6c19181f 100644
--- a/hist/histv7/inc/ROOT/RHistEngine.hxx
+++ b/hist/histv7/inc/ROOT/RHistEngine.hxx
@@ -20,6 +20,7 @@
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
+#include <cstring>
 #include <initializer_list>
 #include <stdexcept>
 #include <tuple>
@@ -812,6 +813,43 @@ public:
       return Slice(sliceSpecs);
    }
 
+   /// Create an atomic snapshot of this histogram engine.
+   ///
+   /// A snapshot is a consistent copy of the histogram, during concurrent filling. It is guaranteed that the returned
+   /// copy represents a state between the begin and end of the snapshot operation.
+   ///
+   /// Snapshotting a histogram engine with many bins can be an expensive operation.
+   ///
+   /// \return the atomic snapshot
+   RHistEngine SnapshotAtomic() const
+   {
+      static_assert(std::is_trivially_copyable_v<BinContentType>,
+                    "snapshotting requires a trivially copyable bin content type");
+
+      RHistEngine snapshot(fAxes.Get());
+      // Do a first collect.
+      for (std::size_t i = 0; i < fBinContents.size(); i++) {
+         Internal::AtomicLoad(&fBinContents[i], &snapshot.fBinContents[i]);
+      }
+
+      // Now do another collect. If no change is detected, the snapshot is consistent. Otherwise update the bin contents
+      // and try again.
+      BinContentType tmp;
+      bool changed;
+      do {
+         changed = false;
+         for (std::size_t i = 0; i < fBinContents.size(); i++) {
+            Internal::AtomicLoad(&fBinContents[i], &tmp);
+            if (std::memcmp(&tmp, &snapshot.fBinContents[i], sizeof(BinContentType))) {
+               std::memcpy(&snapshot.fBinContents[i], &tmp, sizeof(BinContentType));
+               changed = true;
+            }
+         }
+      } while (changed);
+
+      return snapshot;
+   }
+
    /// \}
 
    /// %ROOT Streamer function to throw when trying to store an object of this class.
diff --git a/hist/histv7/test/hist_engine_atomic.cxx b/hist/histv7/test/hist_engine_atomic.cxx
index 3d141ae7c1107..65d7011d0438f 100644
--- a/hist/histv7/test/hist_engine_atomic.cxx
+++ b/hist/histv7/test/hist_engine_atomic.cxx
@@ -267,6 +267,64 @@ TEST(RHistEngine, StressFillAddAtomicWeight)
    EXPECT_EQ(engineA.GetBinContent(0), NOps * Weight);
 }
 
+TEST(RHistEngine, SnapshotAtomic)
+{
+   static constexpr std::size_t Bins = 20;
+   const RRegularAxis axis(Bins, {0, Bins});
+   RHistEngine<int> engineA({axis});
+
+   engineA.Fill(-100);
+   for (std::size_t i = 0; i < Bins; i++) {
+      engineA.Fill(i + 0.5);
+   }
+   engineA.Fill(100);
+
+   RHistEngine<int> engineB = engineA.SnapshotAtomic();
+   ASSERT_EQ(engineB.GetNDimensions(), 1);
+   ASSERT_EQ(engineB.GetTotalNBins(), Bins + 2);
+
+   EXPECT_EQ(engineB.GetBinContent(RBinIndex::Underflow()), 1);
+   for (auto index : axis.GetNormalRange()) {
+      EXPECT_EQ(engineB.GetBinContent(index), 1);
+   }
+   EXPECT_EQ(engineB.GetBinContent(RBinIndex::Overflow()), 1);
+}
+
+TEST(RHistEngine, StressSnapshotAtomic)
+{
+   static constexpr std::size_t Bins = 20;
+   static constexpr std::size_t NThreads = 4;
+   static constexpr std::size_t NFillsPerThread = 100000;
+   static constexpr std::size_t NSnapshots = 10000;
+
+   // Create a histogram with some bins that takes a bit of time to snapshot. The idea of this stress test is then to
+   // fill the first and last bin from multiple threads. If the snapshot is consistent, it must never have a bigger bin
+   // content in the last bin.
+   RHistEngine<int> engine(Bins, {0, Bins});
+   int first = 0, last = 0;
+
+   std::atomic_flag snapshotter;
+   StressInParallel(NThreads, [&] {
+      if (!snapshotter.test_and_set()) {
+         for (std::size_t i = 0; i < NSnapshots; i++) {
+            RHistEngine<int> snapshot = engine.SnapshotAtomic();
+            first = snapshot.GetBinContent(0);
+            last = snapshot.GetBinContent(Bins - 1);
+            if (last > first) {
+               return;
+            }
+         }
+      } else {
+         for (std::size_t i = 0; i < NFillsPerThread; i++) {
+            engine.FillAtomic(0.5);
+            engine.FillAtomic(Bins - 0.5);
+         }
+      }
+   });
+
+   EXPECT_GE(first, last);
+}
+
 TEST(RHistEngine_RBinWithError, AddAtomic)
 {
    static constexpr std::size_t Bins = 20;

From ae96b561abe3a24fb2dd24d4265bd70594d81fdf Mon Sep 17 00:00:00 2001
From: Jonas Hahnfeld <jonas.hahnfeld@cern.ch>
Date: Fri, 20 Mar 2026 15:49:25 +0100
Subject: [PATCH 7/8] [hist] Support snapshot with user-defined types

---
 hist/histv7/inc/ROOT/RHistUtils.hxx     |  6 +++
 hist/histv7/test/hist_engine_atomic.cxx | 58 +++++++++++++++++++++++++
 hist/histv7/test/hist_user.cxx          | 15 +++++++
 3 files changed, 79 insertions(+)

diff --git a/hist/histv7/inc/ROOT/RHistUtils.hxx b/hist/histv7/inc/ROOT/RHistUtils.hxx
index 0240b0d056f26..30ba986efa871 100644
--- a/hist/histv7/inc/ROOT/RHistUtils.hxx
+++ b/hist/histv7/inc/ROOT/RHistUtils.hxx
@@ -151,6 +151,12 @@ std::enable_if_t<std::is_arithmetic_v<T>> AtomicLoad(const T *ptr, T *ret)
 #endif
 }
 
+template <typename T>
+auto AtomicLoad(const T *ptr, T *ret) -> decltype(ptr->AtomicLoad(ret))
+{
+   return ptr->AtomicLoad(ret);
+}
+
 template <typename T>
 std::enable_if_t<std::is_arithmetic_v<T>> AtomicLoadAcquire(const T *ptr, T *ret)
 {
diff --git a/hist/histv7/test/hist_engine_atomic.cxx b/hist/histv7/test/hist_engine_atomic.cxx
index 65d7011d0438f..41e110740a587 100644
--- a/hist/histv7/test/hist_engine_atomic.cxx
+++ b/hist/histv7/test/hist_engine_atomic.cxx
@@ -441,3 +441,61 @@ TEST(RHistEngine_RBinWithError, StressFillAtomicWeight)
    EXPECT_EQ(engine.GetBinContent(0).fSum, NFills * Weight);
    EXPECT_EQ(engine.GetBinContent(0).fSum2, NFills * Weight * Weight);
 }
+
+TEST(RHistEngine_RBinWithError, SnapshotAtomic)
+{
+   static constexpr std::size_t Bins = 20;
+   const RRegularAxis axis(Bins, {0, Bins});
+   RHistEngine<RBinWithError> engineA({axis});
+
+   engineA.Fill(-100);
+   for (std::size_t i = 0; i < Bins; i++) {
+      engineA.Fill(i + 0.5);
+   }
+   engineA.Fill(100);
+
+   RHistEngine<RBinWithError> engineB = engineA.SnapshotAtomic();
+   ASSERT_EQ(engineB.GetNDimensions(), 1);
+   ASSERT_EQ(engineB.GetTotalNBins(), Bins + 2);
+
+   EXPECT_EQ(engineB.GetBinContent(RBinIndex::Underflow()).fSum, 1);
+   for (auto index : axis.GetNormalRange()) {
+      EXPECT_EQ(engineB.GetBinContent(index).fSum, 1);
+   }
+   EXPECT_EQ(engineB.GetBinContent(RBinIndex::Overflow()).fSum, 1);
+}
+
+TEST(RHistEngine_RBinWithError, StressSnapshotAtomic)
+{
+   static constexpr std::size_t Bins = 20;
+   static constexpr std::size_t NThreads = 4;
+   static constexpr std::size_t NFillsPerThread = 100000;
+   static constexpr std::size_t NSnapshots = 10000;
+
+   // Create a histogram with some bins that takes a bit of time to snapshot. The idea of this stress test is then to
+   // fill the first and last bin from multiple threads. If the snapshot is consistent, it must never have a bigger bin
+   // content in the last bin.
+   RHistEngine<RBinWithError> engine(Bins, {0, Bins});
+   double first = 0, last = 0;
+
+   std::atomic_flag snapshotter;
+   StressInParallel(NThreads, [&] {
+      if (!snapshotter.test_and_set()) {
+         for (std::size_t i = 0; i < NSnapshots; i++) {
+            RHistEngine<RBinWithError> snapshot = engine.SnapshotAtomic();
+            first = snapshot.GetBinContent(0).fSum;
+            last = snapshot.GetBinContent(Bins - 1).fSum;
+            if (last > first) {
+               return;
+            }
+         }
+      } else {
+         for (std::size_t i = 0; i < NFillsPerThread; i++) {
+            engine.FillAtomic(0.5);
+            engine.FillAtomic(Bins - 0.5);
+         }
+      }
+   });
+
+   EXPECT_GE(first, last);
+}
diff --git a/hist/histv7/test/hist_user.cxx b/hist/histv7/test/hist_user.cxx
index b5d348da0cdd9..462e8a7ed5a52 100644
--- a/hist/histv7/test/hist_user.cxx
+++ b/hist/histv7/test/hist_user.cxx
@@ -61,6 +61,8 @@ struct User {
    void AtomicAdd(const UserWeight &w) { ROOT::Experimental::Internal::AtomicAdd(&fValue, w.fWeight); }
 
    void AtomicAdd(const User &rhs) { ROOT::Experimental::Internal::AtomicAdd(&fValue, rhs.fValue); }
+
+   void AtomicLoad(User *ret) const { ROOT::Experimental::Internal::AtomicLoad(&fValue, &ret->fValue); }
 };
 
 static_assert(std::is_nothrow_move_constructible_v<RHistEngine<User>>);
@@ -274,3 +276,16 @@ TEST(RHistEngineUser, SetBinContent)
    engine.SetBinContent(indices, 43);
    EXPECT_EQ(engine.GetBinContent(indices).fValue, 43);
 }
+
+TEST(RHistEngineUser, SnapshotAtomic)
+{
+   // Snapshotting uses AtomicLoad.
+   static constexpr std::size_t Bins = 20;
+   const RRegularAxis axis(Bins, {0, Bins});
+   RHistEngine<User> engineA({axis});
+
+   engineA.Fill(8.5);
+
+   RHistEngine<User> engineB = engineA.SnapshotAtomic();
+   EXPECT_EQ(engineB.GetBinContent(8).fValue, 1);
+}

From c957255af6de22de7a3957ecfa5f7c3c710d2e73 Mon Sep 17 00:00:00 2001
From: Jonas Hahnfeld <jonas.hahnfeld@cern.ch>
Date: Tue, 24 Mar 2026 13:47:55 +0100
Subject: [PATCH 8/8] [hist] Use release-acquire in SnapshotAtomic

This ensures correctness on weakly-ordered systems. If not using
SnapshotAtomic, release stores would not be needed in FillAtomic.
However, tests indicate that it's not measurable in practice so
we use it unconditionally to keep the implementation simple.
---
 .../benchmark/hist_benchmark_atomic.cxx       | 39 ++++++++++--
 hist/histv7/inc/ROOT/RBinWithError.hxx        |  4 +-
 hist/histv7/inc/ROOT/RHistEngine.hxx          | 13 ++--
 hist/histv7/inc/ROOT/RHistUtils.hxx           | 60 +++++++++++++++++--
 hist/histv7/test/hist_atomic.cxx              | 46 ++++++++++----
 hist/histv7/test/hist_user.cxx                | 12 ++--
 6 files changed, 140 insertions(+), 34 deletions(-)

diff --git a/hist/histv7/benchmark/hist_benchmark_atomic.cxx b/hist/histv7/benchmark/hist_benchmark_atomic.cxx
index 02fe03a14eb43..0196e663836fe 100644
--- a/hist/histv7/benchmark/hist_benchmark_atomic.cxx
+++ b/hist/histv7/benchmark/hist_benchmark_atomic.cxx
@@ -38,6 +38,15 @@ BENCHMARK_DEFINE_F(RHistAtomic_int, AtomicAdd)(benchmark::State &state)
 }
 BENCHMARK_REGISTER_F(RHistAtomic_int, AtomicAdd);
 
+BENCHMARK_DEFINE_F(RHistAtomic_int, AtomicAddRelease)(benchmark::State &state)
+{
+   for (auto _ : state) {
+      ROOT::Experimental::Internal::AtomicAddRelease(&fAtomic, 1);
+      benchmark::ClobberMemory();
+   }
+}
+BENCHMARK_REGISTER_F(RHistAtomic_int, AtomicAddRelease);
+
 BENCHMARK_DEFINE_F(RHistAtomic_int, AtomicLoad)(benchmark::State &state)
 {
    int load;
@@ -90,6 +99,15 @@ BENCHMARK_DEFINE_F(RHistAtomic_float, AtomicAdd)(benchmark::State &state)
 }
 BENCHMARK_REGISTER_F(RHistAtomic_float, AtomicAdd);
 
+BENCHMARK_DEFINE_F(RHistAtomic_float, AtomicAddRelease)(benchmark::State &state)
+{
+   for (auto _ : state) {
+      ROOT::Experimental::Internal::AtomicAddRelease(&fAtomic, 1.0f);
+      benchmark::ClobberMemory();
+   }
+}
+BENCHMARK_REGISTER_F(RHistAtomic_float, AtomicAddRelease);
+
 BENCHMARK_DEFINE_F(RHistAtomic_float, AtomicLoad)(benchmark::State &state)
 {
    float load;
@@ -142,6 +160,15 @@ BENCHMARK_DEFINE_F(RHistAtomic_double, AtomicAdd)(benchmark::State &state)
 }
 BENCHMARK_REGISTER_F(RHistAtomic_double, AtomicAdd);
 
+BENCHMARK_DEFINE_F(RHistAtomic_double, AtomicAddRelease)(benchmark::State &state)
+{
+   for (auto _ : state) {
+      ROOT::Experimental::Internal::AtomicAddRelease(&fAtomic, 1.0);
+      benchmark::ClobberMemory();
+   }
+}
+BENCHMARK_REGISTER_F(RHistAtomic_double, AtomicAddRelease);
+
 BENCHMARK_DEFINE_F(RHistAtomic_double, AtomicLoad)(benchmark::State &state)
 {
    double load;
@@ -185,14 +212,14 @@ BENCHMARK_DEFINE_F(RBinWithError, Inc)(benchmark::State &state)
 }
 BENCHMARK_REGISTER_F(RBinWithError, Inc);
 
-BENCHMARK_DEFINE_F(RBinWithError, AtomicInc)(benchmark::State &state)
+BENCHMARK_DEFINE_F(RBinWithError, AtomicIncRelease)(benchmark::State &state)
 {
    for (auto _ : state) {
-      fBin.AtomicInc();
+      fBin.AtomicIncRelease();
       benchmark::ClobberMemory();
    }
 }
-BENCHMARK_REGISTER_F(RBinWithError, AtomicInc);
+BENCHMARK_REGISTER_F(RBinWithError, AtomicIncRelease);
 
 BENCHMARK_DEFINE_F(RBinWithError, Add)(benchmark::State &state)
 {
@@ -203,14 +230,14 @@ BENCHMARK_DEFINE_F(RBinWithError, Add)(benchmark::State &state)
 }
 BENCHMARK_REGISTER_F(RBinWithError, Add);
 
-BENCHMARK_DEFINE_F(RBinWithError, AtomicAdd)(benchmark::State &state)
+BENCHMARK_DEFINE_F(RBinWithError, AtomicAddRelease)(benchmark::State &state)
 {
    for (auto _ : state) {
-      fBin.AtomicAdd(1.0);
+      fBin.AtomicAddRelease(1.0);
       benchmark::ClobberMemory();
    }
 }
-BENCHMARK_REGISTER_F(RBinWithError, AtomicAdd);
+BENCHMARK_REGISTER_F(RBinWithError, AtomicAddRelease);
 
 BENCHMARK_DEFINE_F(RBinWithError, AtomicLoad)(benchmark::State &state)
 {
diff --git a/hist/histv7/inc/ROOT/RBinWithError.hxx b/hist/histv7/inc/ROOT/RBinWithError.hxx
index 5127803401d6b..de95950577d59 100644
--- a/hist/histv7/inc/ROOT/RBinWithError.hxx
+++ b/hist/histv7/inc/ROOT/RBinWithError.hxx
@@ -97,9 +97,9 @@ private:
    }
 
 public:
-   void AtomicInc() { AtomicAdd(1.0, 1.0); }
+   void AtomicIncRelease() { AtomicAdd(1.0, 1.0); }
 
-   void AtomicAdd(double w) { AtomicAdd(w, w * w); }
+   void AtomicAddRelease(double w) { AtomicAdd(w, w * w); }
 
    /// Add another bin content using atomic instructions.
    ///
diff --git a/hist/histv7/inc/ROOT/RHistEngine.hxx b/hist/histv7/inc/ROOT/RHistEngine.hxx
index 9e8ce6c19181f..6b77023a71eb1 100644
--- a/hist/histv7/inc/ROOT/RHistEngine.hxx
+++ b/hist/histv7/inc/ROOT/RHistEngine.hxx
@@ -17,6 +17,7 @@
 #include "RWeight.hxx"
 
 #include <array>
+#include <atomic>
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
@@ -501,7 +502,7 @@ public:
       RLinearizedIndex index = fAxes.ComputeGlobalIndexImpl<sizeof...(A)>(args);
       if (index.fValid) {
          assert(index.fIndex < fBinContents.size());
-         Internal::AtomicInc(&fBinContents[index.fIndex]);
+         Internal::AtomicIncRelease(&fBinContents[index.fIndex]);
       }
    }
 
@@ -525,7 +526,7 @@ public:
       RLinearizedIndex index = fAxes.ComputeGlobalIndexImpl<sizeof...(A)>(args);
       if (index.fValid) {
          assert(index.fIndex < fBinContents.size());
-         Internal::AtomicAdd(&fBinContents[index.fIndex], weight.fValue);
+         Internal::AtomicAddRelease(&fBinContents[index.fIndex], weight.fValue);
       }
    }
 
@@ -550,7 +551,7 @@ public:
       RLinearizedIndex index = fAxes.ComputeGlobalIndexImpl<sizeof...(A)>(args);
       if (index.fValid) {
          assert(index.fIndex < fBinContents.size());
-         Internal::AtomicAdd(&fBinContents[index.fIndex], weight);
+         Internal::AtomicAddRelease(&fBinContents[index.fIndex], weight);
       }
    }
 
@@ -574,7 +575,7 @@ public:
             RLinearizedIndex index = fAxes.ComputeGlobalIndexImpl<N>(t);
             if (index.fValid) {
                assert(index.fIndex < fBinContents.size());
-               Internal::AtomicAdd(&fBinContents[index.fIndex], weight.fValue);
+               Internal::AtomicAddRelease(&fBinContents[index.fIndex], weight.fValue);
             }
          } else {
             FillAtomic(t);
@@ -837,6 +838,10 @@ public:
       BinContentType tmp;
       bool changed;
       do {
+         // To guarantee correctness, we let the release operation(s) in FillAtomic synchronize with this acquire fence.
+         // This ensures that all previous writes become visible side-effects and the atomic loads will see them.
+         std::atomic_thread_fence(std::memory_order_acquire);
+
          changed = false;
          for (std::size_t i = 0; i < fBinContents.size(); i++) {
             Internal::AtomicLoad(&fBinContents[i], &tmp);
diff --git a/hist/histv7/inc/ROOT/RHistUtils.hxx b/hist/histv7/inc/ROOT/RHistUtils.hxx
index 30ba986efa871..f045f58208843 100644
--- a/hist/histv7/inc/ROOT/RHistUtils.hxx
+++ b/hist/histv7/inc/ROOT/RHistUtils.hxx
@@ -204,6 +204,18 @@ std::enable_if_t<std::is_arithmetic_v<T>, bool> AtomicCompareExchangeAcquire(T *
 #endif
 }
 
+template <typename T>
+std::enable_if_t<std::is_arithmetic_v<T>, bool> AtomicCompareExchangeRelease(T *ptr, T *expected, T *desired)
+{
+#ifndef _MSC_VER
+   return __atomic_compare_exchange(ptr, expected, desired, /*weak=*/false, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
+#else
+   // Cannot specify the memory order directly, use an unconditional fence to avoid branching code.
+   std::atomic_thread_fence(std::memory_order_release);
+   return MSVC::AtomicOps<sizeof(T)>::CompareExchange(ptr, expected, desired);
+#endif
+}
+
 template <typename T>
 std::enable_if_t<std::is_arithmetic_v<T>> AtomicAddCompareExchangeLoop(T *ptr, T val)
 {
@@ -216,6 +228,18 @@ std::enable_if_t<std::is_arithmetic_v<T>> AtomicAddCompareExchangeLoop(T *ptr, T
    }
 }
 
+template <typename T>
+std::enable_if_t<std::is_arithmetic_v<T>> AtomicAddCompareExchangeReleaseLoop(T *ptr, T val)
+{
+   T expected;
+   AtomicLoad(ptr, &expected);
+   T desired = expected + val;
+   while (!AtomicCompareExchangeRelease(ptr, &expected, &desired)) {
+      // expected holds the new value; try again.
+      desired = expected + val;
+   }
+}
+
 #ifdef _MSC_VER
 namespace MSVC {
 inline void AtomicOps<8>::Add(void *ptr, const void *val)
@@ -245,17 +269,35 @@ std::enable_if_t<std::is_floating_point_v<T>> AtomicAdd(T *ptr, T val)
    AtomicAddCompareExchangeLoop(ptr, val);
 }
 
+template <typename T>
+std::enable_if_t<std::is_integral_v<T>> AtomicAddRelease(T *ptr, T val)
+{
+#ifndef _MSC_VER
+   __atomic_fetch_add(ptr, val, __ATOMIC_RELEASE);
+#else
+   // Cannot specify the memory order directly, use a fence.
+   std::atomic_thread_fence(std::memory_order_release);
+   MSVC::AtomicOps<sizeof(T)>::Add(ptr, &val);
+#endif
+}
+
+template <typename T>
+std::enable_if_t<std::is_floating_point_v<T>> AtomicAddRelease(T *ptr, T val)
+{
+   AtomicAddCompareExchangeReleaseLoop(ptr, val);
+}
+
 // For adding a double-precision weight to a single-precision bin content type, cast the argument once before the
 // compare-exchange loop.
-static inline void AtomicAdd(float *ptr, double val)
+static inline void AtomicAddRelease(float *ptr, double val)
 {
-   AtomicAdd(ptr, static_cast<float>(val));
+   AtomicAddRelease(ptr, static_cast<float>(val));
 }
 
 template <typename T>
-std::enable_if_t<std::is_arithmetic_v<T>> AtomicInc(T *ptr)
+std::enable_if_t<std::is_arithmetic_v<T>> AtomicIncRelease(T *ptr)
 {
-   AtomicAdd(ptr, static_cast<T>(1));
+   AtomicAddRelease(ptr, static_cast<T>(1));
 }
 
 template <typename T, typename U>
@@ -264,10 +306,16 @@ auto AtomicAdd(T *ptr, const U &add) -> decltype(ptr->AtomicAdd(add))
    return ptr->AtomicAdd(add);
 }
 
+template <typename T, typename U>
+auto AtomicAddRelease(T *ptr, const U &add) -> decltype(ptr->AtomicAddRelease(add))
+{
+   return ptr->AtomicAddRelease(add);
+}
+
 template <typename T>
-auto AtomicInc(T *ptr) -> decltype(ptr->AtomicInc())
+auto AtomicIncRelease(T *ptr) -> decltype(ptr->AtomicIncRelease())
 {
-   return ptr->AtomicInc();
+   return ptr->AtomicIncRelease();
 }
 
 } // namespace Internal
diff --git a/hist/histv7/test/hist_atomic.cxx b/hist/histv7/test/hist_atomic.cxx
index 26a544d405bba..c1502d283b664 100644
--- a/hist/histv7/test/hist_atomic.cxx
+++ b/hist/histv7/test/hist_atomic.cxx
@@ -13,10 +13,10 @@ class RHistAtomic : public testing::Test {};
 using AtomicTypes = testing::Types<char, short, int, long, long long, float, double>;
 TYPED_TEST_SUITE(RHistAtomic, AtomicTypes);
 
-TYPED_TEST(RHistAtomic, AtomicInc)
+TYPED_TEST(RHistAtomic, AtomicIncRelease)
 {
    TypeParam a = 1;
-   ROOT::Experimental::Internal::AtomicInc(&a);
+   ROOT::Experimental::Internal::AtomicIncRelease(&a);
    EXPECT_EQ(a, 2);
 }
 
@@ -28,7 +28,15 @@ TYPED_TEST(RHistAtomic, AtomicAdd)
    EXPECT_EQ(a, 3);
 }
 
-// AtomicInc is implemented in terms of AtomicAdd, so it's sufficient to stress one of them.
+TYPED_TEST(RHistAtomic, AtomicAddRelease)
+{
+   TypeParam a = 1;
+   const TypeParam b = 2;
+   ROOT::Experimental::Internal::AtomicAddRelease(&a, b);
+   EXPECT_EQ(a, 3);
+}
+
+// AtomicInc* is implemented in terms of AtomicAdd*, so it's sufficient to stress one of them.
 TYPED_TEST(RHistAtomic, StressAtomicAdd)
 {
    static constexpr TypeParam Addend = 1;
@@ -47,25 +55,43 @@ TYPED_TEST(RHistAtomic, StressAtomicAdd)
    EXPECT_EQ(a, NAdds * Addend);
 }
 
-TEST(AtomicAdd, FloatDouble)
+TYPED_TEST(RHistAtomic, StressAtomicAddRelease)
+{
+   static constexpr TypeParam Addend = 1;
+   static constexpr std::size_t NThreads = 4;
+   // Reduce number of additions for char to avoid overflow.
+   static constexpr std::size_t NAddsPerThread = sizeof(TypeParam) == 1 ? 20 : 8000;
+   static constexpr std::size_t NAdds = NThreads * NAddsPerThread;
+
+   TypeParam a = 0;
+   StressInParallel(NThreads, [&] {
+      for (std::size_t i = 0; i < NAddsPerThread; i++) {
+         ROOT::Experimental::Internal::AtomicAddRelease(&a, Addend);
+      }
+   });
+
+   EXPECT_EQ(a, NAdds * Addend);
+}
+
+TEST(AtomicAddRelease, FloatDouble)
 {
    float a = 1;
    const double b = 2;
-   ROOT::Experimental::Internal::AtomicAdd(&a, b);
+   ROOT::Experimental::Internal::AtomicAddRelease(&a, b);
    EXPECT_EQ(a, 3);
 }
 
-TEST(RBinWithError, AtomicAdd)
+TEST(RBinWithError, AtomicAddRelease)
 {
    RBinWithError bin;
    bin.fSum = 1;
    bin.fSum2 = 2;
-   bin.AtomicAdd(1.5);
+   bin.AtomicAddRelease(1.5);
    EXPECT_EQ(bin.fSum, 2.5);
    EXPECT_EQ(bin.fSum2, 4.25);
 }
 
-TEST(RBinWithError, StressAtomicAdd)
+TEST(RBinWithError, StressAtomicAddRelease)
 {
    static constexpr double Addend = 2.0;
    static constexpr std::size_t NThreads = 4;
@@ -75,7 +101,7 @@ TEST(RBinWithError, StressAtomicAdd)
    RBinWithError bin;
    StressInParallel(NThreads, [&] {
       for (std::size_t i = 0; i < NAddsPerThread; i++) {
-         bin.AtomicAdd(Addend);
+         bin.AtomicAddRelease(Addend);
       }
    });
 
@@ -120,7 +146,7 @@ TEST(RBinWithError, StressAtomicLoad)
          }
       } else {
          for (std::size_t i = 0; i < NAddsPerThread; i++) {
-            bin.AtomicAdd(Addend);
+            bin.AtomicAddRelease(Addend);
          }
       }
    });
diff --git a/hist/histv7/test/hist_user.cxx b/hist/histv7/test/hist_user.cxx
index 462e8a7ed5a52..fea882bb4df25 100644
--- a/hist/histv7/test/hist_user.cxx
+++ b/hist/histv7/test/hist_user.cxx
@@ -54,11 +54,11 @@ struct User {
       return *this;
    }
 
-   void AtomicInc() { ROOT::Experimental::Internal::AtomicInc(&fValue); }
+   void AtomicIncRelease() { ROOT::Experimental::Internal::AtomicIncRelease(&fValue); }
 
-   void AtomicAdd(double w) { ROOT::Experimental::Internal::AtomicAdd(&fValue, w); }
+   void AtomicAddRelease(double w) { ROOT::Experimental::Internal::AtomicAddRelease(&fValue, w); }
 
-   void AtomicAdd(const UserWeight &w) { ROOT::Experimental::Internal::AtomicAdd(&fValue, w.fWeight); }
+   void AtomicAddRelease(const UserWeight &w) { ROOT::Experimental::Internal::AtomicAddRelease(&fValue, w.fWeight); }
 
    void AtomicAdd(const User &rhs) { ROOT::Experimental::Internal::AtomicAdd(&fValue, rhs.fValue); }
 
@@ -194,7 +194,7 @@ TEST(RHistEngineUser, FillUserWeightInvalidNumberOfArguments)
 
 TEST(RHistEngineUser, FillAtomic)
 {
-   // Unweighted filling with atomic instructions uses AtomicInc
+   // Unweighted filling with atomic instructions uses AtomicIncRelease
    static constexpr std::size_t Bins = 20;
    const RRegularAxis axis(Bins, {0, Bins});
    RHistEngine<User> engine({axis});
@@ -209,7 +209,7 @@ TEST(RHistEngineUser, FillAtomic)
 
 TEST(RHistEngineUser, FillAtomicWeight)
 {
-   // Weighted filling with atomic instructions uses AtomicAdd(double)
+   // Weighted filling with atomic instructions uses AtomicAddRelease(double)
    static constexpr std::size_t Bins = 20;
    const RRegularAxis axis(Bins, {0, Bins});
    RHistEngine<User> engine({axis});
@@ -224,7 +224,7 @@ TEST(RHistEngineUser, FillAtomicWeight)
 
 TEST(RHistEngineUser, FillAtomicUserWeight)
 {
-   // Weighted filling with user-defined weight and atomic instructions uses AtomicAdd(const UserWeight &)
+   // Weighted filling with user-defined weight and atomic instructions uses AtomicAddRelease(const UserWeight &)
    static constexpr std::size_t Bins = 20;
    const RRegularAxis axis(Bins, {0, Bins});
    RHistEngine<User> engine({axis});